From b41c7d9b9980fc9fb54dd77b32371ab52b02bc28 Mon Sep 17 00:00:00 2001
From: fuhailin
Date: Wed, 25 Jun 2025 18:14:43 +0800
Subject: [PATCH 01/11] New version initialization

---
 .bazelversion | 2 +-
 BUILD | 41 +-
 README.md | 294 +-
 WORKSPACE | 105 +-
 build.sh | 9 +-
 build_deps/BUILD | 37 +
 build_deps/build_pip_pkg.sh | 4 +-
 build_deps/patches/BUILD | 15 +
 build_deps/patches/internal_visibility.patch | 13 +
 build_deps/patches/python_toolchain.patch | 74 +
 build_deps/patches/tensorflow_llvm_url.patch | 23 +
 build_deps/patches/tensorflow_serving.patch | 25 +
 .../tensorflow_tf_gen_op_wrapper_py.patch | 11 +
 build_deps/patches/tensorflow_zlib.patch | 11 +
 build_deps/patches/tf2xla_visibility.patch | 13 +
 build_deps/pip_tf/BUILD | 30 +
 build_deps/pip_tf/README.md | 25 +
 build_deps/pip_tf/defs.bzl | 132 +
 build_deps/pip_tf/pip_tf_flags_test.py | 65 +
 build_deps/pip_tf/tensorflow.bzl | 3536 ++
 build_deps/requirements.in | 37 +
 build_deps/requirements_lock_3_10.txt | 983 +
 build_deps/requirements_lock_3_11.txt | 733 +
 build_deps/requirements_lock_3_12.txt | 739 +
 build_deps/requirements_lock_3_13.txt | 739 +
 build_deps/tf_dependency/build_defs.bzl.tpl | 1 +
 build_deps/tf_dependency/tf_configure.bzl | 4 +
 build_deps/toolchains/gpu/crosstool/BUILD.tpl | 69 -
 .../toolchains/gpu/crosstool/CROSSTOOL.tpl | 1409 -
 .../gpu/crosstool/cc_toolchain_config.bzl.tpl | 1493 -
 .../crosstool_wrapper_driver_is_not_gcc.tpl | 269 -
 .../windows/msvc_wrapper_for_nvcc.py.tpl | 192 -
 build_deps/toolchains/gpu/cub.BUILD | 25 -
 build_deps/toolchains/gpu/cuda/BUILD.tpl | 227 -
 .../toolchains/gpu/cuda/BUILD.windows.tpl | 164 -
 .../toolchains/gpu/cuda/build_defs.bzl.tpl | 62 -
 .../toolchains/gpu/cuda/cuda_config.h.tpl | 26 -
 build_deps/toolchains/gpu/cuda_configure.bzl | 1116 -
 build_deps/toolchains/gpu/find_cuda_config.py | 632 -
 configure.py | 415 +-
 deepray/BUILD | 122 +-
 deepray/__init__.py | 113 +-
 deepray/activations/__init__.py | 27 -
 deepray/callbacks/__init__.py | 3 +-
 deepray/callbacks/callbacks.py | 45 +-
 deepray/callbacks/model_checkpoint.py | 147 +
 deepray/callbacks/profiler_callback.py | 68 +
 deepray/callbacks/progbar_logger.py | 458 +
 .../time_history.py} | 72 +-
 deepray/callbacks/time_stopping.py | 2 +-
 deepray/callbacks/tqdm_progress_bar.py | 2 +-
 deepray/callbacks/training_speed.py | 155 +
 deepray/copts.bzl | 8 +
 deepray/core/base_trainer.py | 991 -
 deepray/core/base_trainer_test.py | 351 -
 deepray/core/common/distribution_utils.py | 24 +-
 deepray/core/common/flags.py | 14 -
 deepray/core/compile_utils.py | 18 +-
 deepray/core/dllogger_class.py | 77 -
 deepray/core/module.py | 627 -
 .../core/platform/build_config.default.bzl | 4 +-
 deepray/core/platform/build_config_root.bzl | 10 +-
 deepray/core/trainer.py | 3073 ++
 deepray/core/utils/misc/distribution_utils.py | 34 -
 deepray/core/utils/misc/keras_utils.py | 208 -
 deepray/custom_ops/BUILD | 5 +-
 deepray/custom_ops/correlation_cost/BUILD | 29 +-
 .../cc/kernels/correlation_cost_op_gpu.cu.cc | 3 +-
 .../correlation_cost/python/optical_flow.py | 6 +-
 .../python/tests/run_all_test.py | 3 +-
 .../custom_ops/distributed_embeddings/BUILD | 17 +-
 .../cc/kernels/embedding_lookup_kernels.cu.cc | 5 +-
 .../cc/ops/embedding_lookup_ops.cc | 11 +-
 .../python/tests/dist_model_parallel_test.py | 2 -
 deepray/custom_ops/embedding_bag/BUILD | 49 +
 deepray/custom_ops/embedding_bag/__init__.py | 1 +
 .../embedding_bag_backward_kernels.cu.cc | 247 +
 .../cc/kernels/embedding_bag_ops.cc | 330 +
 .../cc/kernels/embedding_bag_ops.h | 57 +
 .../cc/kernels/embedding_bag_ops_gpu.cu.cc | 108 +
.../embedding_bag/cc/ops/embedding_bag_ops.cc | 70 + .../embedding_bag/python}/__init__.py | 0 .../embedding_bag/python/embedding_bag.py | 143 + .../embedding_bag/python/tests}/__init__.py | 0 .../python/tests/embedding_bag_test.py | 116 + .../python}/tests/run_all_test.py | 3 +- deepray/custom_ops/embedding_variable/BUILD | 282 + .../custom_ops/embedding_variable/__init__.py | 3 + .../embedding_variable/cc/embedding/BUILD | 269 + .../cc/embedding/batch.cu.cc | 219 + .../embedding_variable/cc/embedding/batch.h | 66 + .../cc/embedding/bloom_filter_policy.h | 438 + .../embedding_variable/cc/embedding/cache.h | 521 + .../cc/embedding/cache_factory.h | 47 + .../cc/embedding/cache_thread_pool_creator.h | 45 + .../cc/embedding/config.proto | 58 + .../counter_filter_descriptor_impl.h | 252 + .../cc/embedding/counter_filter_policy.h | 189 + .../cc/embedding/cpu_hash_map_kv.h | 214 + .../cc/embedding/dense_hash_map_kv.h | 151 + .../cc/embedding/dram_leveldb_storage.h | 221 + .../cc/embedding/dram_pmem_storage.h | 218 + .../cc/embedding/dram_ssd_storage.h | 214 + .../dynamic_dim_feature_descriptor_impl.h | 195 + .../cc/embedding/emb_file.h | 244 + .../cc/embedding/emb_file_creator.h | 97 + .../cc/embedding/embedding_config.h | 110 + .../cc/embedding/embedding_memory_pool.h | 89 + .../cc/embedding/embedding_var.cu.cc | 77 + .../cc/embedding/embedding_var.h | 706 + .../cc/embedding/embedding_var_ckpt_data.cc | 229 + .../cc/embedding/embedding_var_ckpt_data.h | 57 + .../cc/embedding/embedding_var_context.h | 64 + .../embedding/embedding_var_dump_iterator.h | 91 + .../cc/embedding/embedding_var_restore.cc | 646 + .../cc/embedding/embedding_var_restore.h | 223 + .../cc/embedding/eviction_manager.h | 139 + .../cc/embedding/feature_descriptor.h | 154 + .../cc/embedding/feature_descriptor_impl.h | 299 + .../cc/embedding/filter_factory.h | 51 + .../cc/embedding/filter_policy.h | 106 + .../cc/embedding/globalstep_shrink_policy.h | 62 + .../cc/embedding/gpu_hash_map_kv.h | 333 + .../cc/embedding/gpu_hash_table.cu.cc | 708 + .../cc/embedding/gpu_hash_table.h | 136 + .../cc/embedding/hbm_dram_ssd_storage.h | 601 + .../cc/embedding/hbm_dram_storage.h | 536 + .../hbm_multi_tier_feature_descriptor.h | 116 + .../cc/embedding/hbm_storage_iterator.h | 124 + .../intra_thread_copy_id_allocator.h | 73 + .../cc/embedding/kv_interface.h | 121 + .../cc/embedding/l2weight_shrink_policy.h | 71 + .../cc/embedding/leveldb_kv.h | 288 + .../cc/embedding/multi_tier_storage.cu.cc | 188 + .../cc/embedding/multi_tier_storage.h | 303 + .../cc/embedding/normal_feature_descriptor.h | 127 + .../cc/embedding/nullable_filter_policy.h | 173 + .../cc/embedding/shrink_policy.h | 72 + .../cc/embedding/single_tier_storage.h | 581 + .../cc/embedding/ssd_hash_kv.h | 802 + .../cc/embedding/ssd_record_descriptor.cc | 80 + .../cc/embedding/ssd_record_descriptor.h | 105 + .../embedding_variable/cc/embedding/storage.h | 367 + .../cc/embedding/storage_config.h | 59 + .../cc/embedding/storage_factory.h | 78 + .../embedding_lookup_sparse_local_op.cc | 757 + .../embedding_lookup_sparse_local_op_test.cc | 901 + .../embedding_lookup_sparse_op.h | 11 + ...bedding_lookup_sparse_post_grad_op_test.cc | 394 + .../embedding_lookup_sparse_post_op.cc | 466 + .../embedding_lookup_sparse_post_op_test.cc | 419 + .../embedding_lookup_sparse_pre_op.cc | 315 + .../embedding_lookup_sparse_pre_op_test.cc | 627 + .../fused_embedding_common.cu.h | 98 + .../fused_embedding_local_ops_gpu.cu.cc | 315 + .../fused_embedding_local_ops_test.cc | 419 + 
.../cc/fused_embedding/fused_embedding_ops.cc | 308 + .../fused_embedding_post_grad_ops_test.cc | 243 + .../fused_embedding_post_ops_gpus.cu.cc | 328 + .../fused_embedding_post_ops_test.cc | 213 + .../fused_embedding_pre_ops_gpus.cu.cc | 521 + .../fused_embedding_pre_ops_test.cc | 352 + .../cc/fused_layer_norm/BUILD | 22 + .../cc/fused_layer_norm/compile_util.h | 78 + .../fused_layer_normalize_ops.cc | 678 + .../fused_layer_normalize_ops_test.cc | 269 + .../group_embedding_lookup_ops.cc | 176 + .../group_embedding_lookup_ops.cu.cc | 105 + .../group_embedding_lookup_ops_test.cc | 1089 + ...dding_lookup_sparse_backward_base_ops.cu.h | 371 + ...up_embedding_lookup_sparse_backward_ops.cc | 264 + ...embedding_lookup_sparse_backward_ops.cu.cc | 176 + ...edding_lookup_sparse_forward_base_ops.cu.h | 721 + ...embedding_lookup_sparse_forward_base_ops.h | 64 + ...oup_embedding_lookup_sparse_forward_ops.cc | 690 + ..._embedding_lookup_sparse_forward_ops.cu.cc | 309 + .../incr_save_restore_ops.cc | 493 + .../incr_save_restore/incr_save_restore_ops.h | 553 + .../incr_save_restore_ops_test.cc | 256 + .../cc/kernels/embedding_collection.cc | 95 + .../cc/kernels/hotness_calculate.cu.cc | 84 + .../cc/kernels/hotness_calculate.h | 41 + .../cc/kernels/kv_variable_lookup_ops.cc | 593 + .../cc/kernels/kv_variable_ops.cc | 620 + .../cc/kernels/kv_variable_restore_ops.cc | 259 + .../cc/kernels/kv_variable_util.cc | 69 + .../cc/kernels/kv_variable_util.h | 165 + .../cc/kernels/save_restore_ops.cc | 176 + .../cc/kernels/save_restore_tensor_ev.h | 82 + .../cc/kernels/training_adagrad_ops.cc | 383 + .../cc/kernels/training_adam_async_ops.cc | 603 + .../cc/kernels/training_adam_ops.cc | 529 + .../cc/kernels/training_ali_op_helpers.h | 182 + .../cc/kernels/training_ali_ops_gpu.cu.cc | 650 + .../cc/kernels/training_ali_ops_gpu.h | 119 + .../cc/kernels/training_ftrl_ops.cc | 485 + .../cc/kernels/training_sgd_ops.cc | 200 + .../cc/ops/embedding_collection.cc | 39 + .../cc/ops/group_embedding_ops.cc | 282 + .../cc/ops/incr_save_restore_ops.cc | 73 + .../cc/ops/kv_variable_ops.cc | 436 + .../cc/ops/save_restore_ops.cc | 122 + .../cc/ops/training_adagrad_ops.cc | 109 + .../cc/ops/training_adam_async_ops.cc | 129 + .../cc/ops/training_adam_ops.cc | 127 + .../cc/ops/training_ftrl_ops.cc | 96 + .../cc/ops/training_sgd_ops.cc | 80 + .../embedding_variable/cc/tests/BUILD | 65 + .../tests/embedding_variable_memory_test.cc | 80 + .../cc/tests/embedding_variable_ops_test.cc | 1324 + .../embedding_variable_performance_test.cc | 455 + .../cc/tests/embedding_variable_test.h | 109 + .../embedding_variable/config.proto | 58 + .../embedding_variable_ops_test.py | 114 + .../embedding_variable/multiplex_1_test.py | 50 + .../embedding_variable/python}/__init__.py | 0 .../python/group_embedding_lookup_ops.py | 543 + .../python/group_embedding_types.py | 54 + .../python/kv_variable_ops.py | 1027 + .../python/tests/__init__.py | 0 .../python/tests/embedding_bag_test.py | 116 + .../tests/group_embedding_lookup_ops_test.py | 254 + .../python/tests/run_all_test.py | 7 + .../embedding_variable/variable_scope.py | 1277 + .../embedding_variable/variables.py | 206 + deepray/custom_ops/ffm_ops/BUILD | 12 +- .../ffm_ops/cc/kernels/ffm_kernels.cu.cc | 3 +- deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc | 5 +- deepray/custom_ops/ffm_ops/python/ffm_ops.py | 2 +- deepray/custom_ops/multiplex_1/BUILD | 17 +- .../custom_ops/multiplex_1/__init__.py | 0 .../custom_ops/multiplex_1/multiplex_1_op.cc | 9 +- deepray/custom_ops/multiplex_2/BUILD | 12 +- 
.../custom_ops/multiplex_2/__init__.py | 0 .../multiplex_2/multiplex_2_kernel.cc | 2 +- .../multiplex_2/multiplex_2_kernel.cu.cc | 2 +- .../custom_ops/multiplex_2/multiplex_2_op.cc | 2 +- deepray/custom_ops/multiplex_3/BUILD | 13 +- .../custom_ops/multiplex_3/__init__.py | 0 .../multiplex_3/multiplex_3_kernel.cc | 3 +- .../custom_ops/multiplex_3/multiplex_3_op.cc | 5 +- deepray/custom_ops/multiplex_4/BUILD | 5 + deepray/custom_ops/multiplex_4/__init__.py | 0 .../custom_ops/multiplex_4/multiplex_4_op.cc | 3 +- deepray/custom_ops/parquet_dataset/BUILD | 16 +- .../parquet_dataset/cc/kernels/arrow_util.cc | 14 +- .../parquet_dataset/cc/kernels/eigen.h | 2 +- .../cc/kernels/parquet_batch_reader.cc | 34 +- .../cc/kernels/parquet_dataset_ops.cc | 10 +- .../cc/kernels/parquet_dataset_ops.h | 2 +- .../cc/kernels/parquet_pybind.cc | 7 +- .../parquet_dataset/python/dataframe.py | 30 +- .../python/parquet_dataset_ops.py | 33 +- .../parquet_dataset/python/parquet_pybind.py | 7 +- .../python/tests/parquet_dataset_ops_test.py | 299 +- .../parquet_dataset/read_parquet_deepray.py | 24 +- deepray/custom_ops/seq2seq/BUILD | 30 +- deepray/custom_ops/seq2seq/__init__.py | 22 + .../seq2seq/cc/kernels/beam_search_ops.cc | 5 + .../seq2seq/cc/kernels/beam_search_ops.h | 5 + .../seq2seq/python}/README.md | 22 +- deepray/custom_ops/seq2seq/python/__init__.py | 0 .../seq2seq/python}/attention_wrapper.py | 99 +- .../seq2seq/python}/basic_decoder.py | 33 +- .../seq2seq/python}/beam_search_decoder.py | 33 +- .../seq2seq/python}/decoder.py | 22 +- .../seq2seq/python}/loss.py | 7 +- .../seq2seq/python}/sampler.py | 11 +- .../seq2seq/python/tests/__init__.py | 0 .../python}/tests/attention_wrapper_test.py | 18 +- .../python}/tests/basic_decoder_test.py | 9 +- .../python}/tests/beam_search_decoder_test.py | 8 +- .../python}/tests/beam_search_ops_test.py | 12 +- .../seq2seq/python}/tests/decoder_test.py | 9 +- .../seq2seq/python}/tests/loss_test.py | 5 +- .../seq2seq/python/tests/run_all_test.py | 9 + deepray/custom_ops/simple_hash_table/BUILD | 6 +- .../simple_hash_table_kernel.cc | 11 +- .../simple_hash_table/simple_hash_table_op.cc | 9 +- deepray/custom_ops/sleep/BUILD | 3 +- deepray/custom_ops/sleep/sleep_op.cc | 5 +- deepray/custom_ops/text/BUILD | 6 +- deepray/custom_ops/training_ops/BUILD | 8 +- .../training_ops/cc/kernels/training_ops.cc | 194 +- .../training_ops/cc/kernels/training_ops.h | 13 + .../cc/kernels/training_ops_gpu.cu.cc | 64 +- .../training_ops/cc/ops/training_ops.cc | 53 +- deepray/custom_ops/unique_ops/BUILD | 45 +- .../unique_ops/cc/kernels/random.cc | 58 - .../unique_ops/cc/kernels/task_runner.h | 2 +- .../unique_ops/cc/kernels/unique_ali_op.cc | 100 +- .../cc/kernels/unique_ali_op_gpu.cu.cc | 14 +- .../cc/kernels/unique_ali_op_util.h | 147 +- .../unique_ops/cc/ops/unique_ops.cc | 26 +- .../python/tests/unique_ali_op_test.py | 349 + .../unique_ops/python/tests/unique_op_test.py | 303 - deepray/custom_ops/utils/BUILD | 127 + deepray/custom_ops/utils/check.h | 33 + deepray/custom_ops/utils/fake_input.cc | 239 + deepray/custom_ops/utils/fake_input.h | 40 + .../utils/kernel_benchmark_testlib.cc | 210 + .../utils/kernel_benchmark_testlib.h | 86 + deepray/custom_ops/utils/ok_status_util.h | 41 + deepray/custom_ops/utils/ops_testutil.cc | 271 + deepray/custom_ops/utils/ops_testutil.h | 212 + deepray/custom_ops/utils/ops_testutil_test.cc | 52 + .../random_test.cc => utils/random.cc} | 23 +- .../{unique_ops/cc/kernels => utils}/random.h | 8 - deepray/custom_ops/utils/spin_lock.h | 73 + 
deepray/custom_ops/utils/spin_rw_lock.h | 248 + deepray/custom_ops/utils/tensor_testutil.cc | 294 + deepray/custom_ops/utils/tensor_testutil.h | 162 + .../custom_ops/utils/tensor_testutil_test.cc | 335 + deepray/custom_ops/zero_out/BUILD | 6 + .../zero_out/cc/kernels/zero_out_kernels.cc | 9 + .../zero_out/cc/ops/zero_out_ops.cc | 3 +- .../adult_census_income.py | 10 +- .../adult_census_income_test.py | 2 - .../ali_display_ad_click.py | 21 +- .../ali_display_ad_click_test.py | 2 - .../amazon_books_2014/amazon_books_2014.py | 10 +- .../amazon_books_2014_test.py | 8 +- deepray/datasets/avazu/avazu.py | 6 +- deepray/datasets/avazu/avazu_test.py | 2 - deepray/datasets/cifar/cifar.py | 19 +- deepray/datasets/cifar/cifar_test.py | 2 - .../creditcardfraud/creditcardfraud.py | 26 +- .../creditcardfraud/creditcardfraud_test.py | 2 - .../{docker => }/Dockerfile_preprocessing | 0 deepray/datasets/criteo/README.md | 282 - deepray/datasets/criteo/criteo.py | 18 +- deepray/datasets/criteo/criteo_dataset.md | 190 + deepray/datasets/criteo/criteo_test.py | 3 - deepray/datasets/criteo/criteo_tsv_reader.py | 11 +- .../datasets/criteo/criteo_tsv_reader_test.py | 2 - deepray/datasets/criteo/feature_map_small.csv | 80 +- .../datasets/criteo/feature_map_xlarge.csv | 80 +- .../datasets/criteo/preproc/data/__init__.py | 0 .../datasets/criteo/preproc/data/defaults.py | 43 + .../criteo/preproc/data/feature_spec.py | 268 + .../criteo/preproc/parquet_to_binary.py | 8 +- .../criteo/preproc/preproc_NVTabular.py | 6 +- .../criteo/preproc/spark_data_utils.py | 16 +- .../datasets/criteo/preproc/split_dataset.py | 6 +- .../requirements_preprocessing.txt | 2 +- deepray/datasets/csv_pipeline.py | 20 - deepray/datasets/csv_pipeline/__init__.py | 0 deepray/datasets/csv_pipeline/csv_pipeline.py | 18 + deepray/datasets/datapipeline.py | 51 +- deepray/datasets/dataset_factory.py | 6 +- .../GooglePretrainedWeightDownloader.py | 4 +- deepray/datasets/downloader/bertPrep.py | 21 +- .../downloader/create_datasets_from_start.sh | 2 +- .../downloader/create_finetuning_data.py | 2 - .../downloader/create_pretraining_data.py | 2 - .../datasets/fashion_mnist/fashion_mnist.py | 22 +- .../fashion_mnist/fashion_mnist_test.py | 2 - .../datasets/imagenet-1k/imagenet_to_gcs.py | 3 - deepray/datasets/imdb/imdb.py | 10 +- deepray/datasets/imdb/imdb_test.py | 2 - deepray/datasets/kafka_dataset.py | 43 - deepray/datasets/kafka_pipeline/__init__.py | 0 .../datasets/kafka_pipeline/kafka_pipeline.py | 254 + .../kafka_pipeline/kafka_pipeline_test.py | 52 + deepray/datasets/mnist/mnist.py | 28 +- deepray/datasets/mnist/mnist_test.py | 45 +- deepray/datasets/movielens/movielens.csv | 2 +- deepray/datasets/movielens/movielens.py | 4 +- .../movielens/movielens_100k_ratings.py | 84 +- .../movielens/movielens_100k_ratings_test.py | 43 - .../movielens/movielens_1m_ratings.py | 94 +- .../movielens/movielens_1m_ratings_test.py | 43 - .../movielens/movielens_ratings_test.py | 39 + deepray/datasets/movielens/process.py | 4 +- deepray/datasets/movielens/producer.py | 14 +- deepray/datasets/openwebtext/openwebtext.py | 7 +- .../datasets/openwebtext/openwebtext_test.py | 2 - .../parquet_pipeline/ali_parquet_dataset.py | 232 +- .../ali_parquet_dataset_test.py | 51 +- .../parquet_pipeline/parquet_pipeline_test.py | 2 - deepray/datasets/squad/classifier_dataset.py | 101 + deepray/datasets/squad/pretrain_dataset.py | 122 + deepray/datasets/squad/squad.py | 12 +- deepray/datasets/squad/squad_dataset.py | 111 + deepray/datasets/squad/squad_test.py | 2 - 
.../tfrecord_pipeline/tfrecord_pipeline.py | 11 +- .../tfrecord_pipeline_test.py | 2 - .../toxic_comment_classification_challenge.py | 7 +- ...c_comment_classification_challenge_test.py | 2 - .../datasets/wikicorpus_en/wikicorpus_en.py | 1 - .../wikicorpus_en/wikicorpus_en_test.py | 2 - deepray/deepray.bzl | 336 +- deepray/layers/BUILD | 3 +- deepray/layers/__init__.py | 3 +- deepray/layers/attention.py | 43 +- deepray/layers/dcn.py | 45 +- deepray/layers/dense.py | 287 + deepray/layers/dense_einsum.py | 1 - deepray/layers/dynamic_embedding.py | 258 +- deepray/layers/embedding.py | 62 +- deepray/layers/embedding_variable.py | 206 + deepray/layers/feature_cross.py | 31 +- deepray/layers/masked_softmax.py | 1 - deepray/layers/max_unpooling_2d.py | 41 +- deepray/layers/max_unpooling_2d_v2.py | 2 +- deepray/layers/mlp.py | 27 +- deepray/layers/networks/__init__.py | 17 - deepray/layers/noisy_dense.py | 33 +- deepray/layers/on_device_embedding.py | 1 - deepray/layers/pooling.py | 4 +- deepray/layers/rnn/esn_cell.py | 8 +- deepray/layers/rnn/layer_norm_lstm_cell.py | 2 +- .../layers/rnn/layer_norm_simple_rnn_cell.py | 2 +- deepray/layers/rnn/nas_cell.py | 8 +- deepray/layers/rnn/tests/esn_cell_test.py | 2 +- .../rnn/tests/layer_norm_lstm_cell_test.py | 2 +- .../tests/layer_norm_simple_rnn_cell_test.py | 2 +- deepray/layers/rnn/tests/nas_cell_test.py | 2 +- deepray/layers/self_attention_mask.py | 1 - .../tests_bak/on_device_embedding_test.py | 183 + deepray/layers/tf_utils.py | 7 +- deepray/layers/transformer.py | 15 +- deepray/layers/transformer_scaffold.py | 1 - deepray/losses/__init__.py | 23 +- deepray/losses/_loss_util.py | 281 + deepray/losses/contrastive.py | 4 +- deepray/losses/focal_loss.py | 7 +- deepray/losses/giou_loss.py | 4 +- deepray/losses/lifted.py | 11 +- deepray/losses/losses_impl.py | 1937 + deepray/losses/quantiles.py | 5 +- deepray/losses/softmax_loss.py | 167 + ...ed_sparse_categorical_crossentropy_test.py | 377 + deepray/losses/triplet.py | 12 +- deepray/losses/utils.py | 563 + ...eighted_sparse_categorical_crossentropy.py | 108 + deepray/metrics/__init__.py | 9 +- deepray/metrics/_ranking.py | 165 + deepray/metrics/alpha_dcg.py | 126 + deepray/metrics/arp.py | 47 + deepray/metrics/cohens_kappa.py | 7 +- deepray/metrics/dcg.py | 75 + deepray/metrics/f_scores.py | 8 +- deepray/metrics/geometric_mean.py | 7 +- deepray/metrics/hits.py | 65 + .../matthews_correlation_coefficient.py | 4 +- deepray/metrics/mean_average_precision.py | 79 + deepray/metrics/metrics_impl.py | 895 + deepray/metrics/mrr.py | 111 + .../metrics/multilabel_confusion_matrix.py | 2 +- deepray/metrics/ndcg.py | 131 + deepray/metrics/opa.py | 55 + deepray/metrics/precision.py | 73 + deepray/metrics/precision_ia.py | 88 + deepray/metrics/r_square.py | 2 +- deepray/metrics/recall.py | 73 + deepray/metrics/streaming_correlations.py | 6 +- deepray/metrics/utils.py | 153 +- deepray/models/BUILD | 1 + deepray/{layers/networks => models}/README.md | 4 +- deepray/models/__init__.py | 3 + .../albert_transformer_encoder.py | 21 +- .../networks => models}/bert_classifier.py | 5 +- .../networks => models}/bert_pretrainer.py | 9 +- .../networks => models}/bert_span_labeler.py | 7 +- .../networks => models}/classification.py | 1 - .../networks => models}/encoder_scaffold.py | 19 +- .../{layers/networks => models}/masked_lm.py | 2 - deepray/models/ncf_common.py | 2 - deepray/models/ncf_model.py | 2 +- deepray/models/ncf_test.py | 4 +- deepray/models/rec/base_model.py | 2 - deepray/models/rec/flen.py | 2 - 
deepray/models/rec/tfra_demo.py | 192 - deepray/models/rec/tower_new_tfra.py | 162 - .../networks => models}/span_labeling.py | 0 deepray/models/tests/__init__.py | 0 .../tests}/albert_transformer_encoder_test.py | 2 +- .../tests}/bert_classifier_test.py | 0 .../tests}/bert_pretrainer_test.py | 0 .../tests}/bert_span_labeler_test.py | 0 .../tests}/classification_test.py | 0 .../tests}/encoder_scaffold_test.py | 28 +- .../tests}/masked_lm_test.py | 3 +- .../tests}/span_labeling_test.py | 0 .../tests}/transformer_encoder_test.py | 2 +- .../transformer_encoder.py | 18 +- deepray/optimizers/BUILD | 13 + deepray/optimizers/__init__.py | 5 +- deepray/optimizers/adagrad.py | 83 + deepray/optimizers/adam.py | 99 +- deepray/optimizers/adam_async.py | 188 + deepray/optimizers/ev_optimizer_patch.py | 260 + deepray/optimizers/ftrl.py | 96 + deepray/optimizers/gradient_descent.py | 91 + deepray/optimizers/lazy_adam.py | 14 +- deepray/optimizers/multi_optimizer.py | 90 +- deepray/optimizers/optimization.py | 5 - .../tests/weight_decay_optimizers_test.py | 2 +- deepray/optimizers/weight_decay_optimizers.py | 2 +- deepray/repo.bzl | 48 + deepray/seq2seq/BUILD | 26 - deepray/seq2seq/__init__.py | 53 - deepray/tensorflow.bzl | 333 - deepray/utils/BUILD | 4 + deepray/utils/benchmark.py | 14 +- deepray/utils/ckpt_util.py | 11 + deepray/utils/data/feature_map.py | 160 +- deepray/utils/data/input_meta.py | 2 - deepray/utils/dllogger_class.py | 77 - deepray/utils/export/export.py | 157 +- deepray/utils/flags/_base.py | 20 +- deepray/utils/flags/_benchmark.py | 24 +- deepray/utils/flags/_device.py | 4 +- deepray/utils/flags/_distribution.py | 8 +- deepray/utils/flags/common_flags.py | 81 +- deepray/utils/flags/core.py | 2 +- deepray/utils/horovod_utils.py | 39 +- deepray/utils/keras_utils.py | 210 +- deepray/utils/logging_util.py | 392 + deepray/utils/logs/hooks.py | 113 - deepray/utils/logs/hooks_test.py | 142 - deepray/utils/logs/logger.py | 2 - deepray/utils/logs/metric_hook.py | 91 - deepray/utils/logs/metric_hook_test.py | 208 - deepray/utils/logs/mlperf_helper.py | 1 - deepray/utils/logs/summary_manager.py | 4 +- deepray/utils/resource_loader.py | 4 +- deepray/utils/test_utils.py | 10 +- deepray/utils/timer.py | 34 + deepray/utils/types.py | 33 +- deepray/version.py | 4 +- deepray/workspace0.bzl | 4 +- deepray/workspace2.bzl | 155 +- deepray/workspace3.bzl | 16 +- docker.sh | 30 +- .../run_horovod.sh | 64 +- .../CV/Classify_images_of_clothing/train.py | 50 +- modelzoo/CV/GAN/train.py | 5 +- modelzoo/CV/SwinTransformers/train.py | 4 +- modelzoo/CV/mnist/run_early.sh | 15 +- modelzoo/CV/mnist/run_horovod.sh | 46 +- modelzoo/CV/mnist/train.py | 132 +- modelzoo/CV/mnist/train_earlystop.py | 112 - modelzoo/ELECTRA/.gitignore | 129 + modelzoo/ELECTRA/Dockerfile | 31 + modelzoo/ELECTRA/LICENSE | 203 + modelzoo/ELECTRA/NOTICE | 5 + modelzoo/ELECTRA/README.md | 1005 + modelzoo/ELECTRA/build_pretraining_dataset.py | 237 + modelzoo/ELECTRA/configuration.py | 132 + modelzoo/ELECTRA/configuration_utils.py | 518 + modelzoo/ELECTRA/data/BooksDownloader.py | 26 + .../ELECTRA/data/BookscorpusTextFormatting.py | 32 + modelzoo/ELECTRA/data/Downloader.py | 91 + .../data/GooglePretrainedWeightDownloader.py | 158 + modelzoo/ELECTRA/data/MRPCDownloader.py | 44 + .../data/NVIDIAPretrainedWeightDownloader.py | 27 + modelzoo/ELECTRA/data/SquadDownloader.py | 54 + modelzoo/ELECTRA/data/TextSharding.py | 327 + modelzoo/ELECTRA/data/WikiDownloader.py | 57 + .../ELECTRA/data/WikicorpusTextFormatting.py | 46 + 
modelzoo/ELECTRA/data/__init__.py | 12 + .../data/create_datasets_from_start.sh | 47 + modelzoo/ELECTRA/data/dataPrep.py | 312 + modelzoo/ELECTRA/data/glue/download_mrpc.sh | 20 + modelzoo/ELECTRA/data/squad/squad_download.sh | 73 + modelzoo/ELECTRA/file_utils.py | 515 + modelzoo/ELECTRA/gpu_affinity.py | 63 + modelzoo/ELECTRA/images/total_loss.svg | 1 + modelzoo/ELECTRA/modeling.py | 1084 + modelzoo/ELECTRA/modeling_utils.py | 2843 ++ modelzoo/ELECTRA/optimization.py | 383 + .../ELECTRA/postprocess_pretrained_ckpt.py | 72 + modelzoo/ELECTRA/pretrain_utils.py | 367 + modelzoo/ELECTRA/run.sub | 88 + modelzoo/ELECTRA/run_inference.py | 212 + modelzoo/ELECTRA/run_pretraining.py | 505 + modelzoo/ELECTRA/run_tf_squad.py | 675 + .../ELECTRA/scripts/benchmark_pretraining.sh | 43 + modelzoo/ELECTRA/scripts/benchmark_squad.sh | 28 + modelzoo/ELECTRA/scripts/bind.sh | 226 + .../scripts/configs/pretrain_config.sh | 411 + .../ELECTRA/scripts/configs/squad_config.sh | 271 + modelzoo/ELECTRA/scripts/docker/build.sh | 15 + modelzoo/ELECTRA/scripts/docker/launch.sh | 29 + .../scripts/finetune_ckpts_on_squad.sh | 28 + modelzoo/ELECTRA/scripts/run_pretraining.sh | 171 + modelzoo/ELECTRA/scripts/run_squad.sh | 112 + modelzoo/ELECTRA/squad_utils.py | 1093 + modelzoo/ELECTRA/tokenization.py | 68 + modelzoo/ELECTRA/tokenization_utils.py | 2415 ++ modelzoo/ELECTRA/utils.py | 231 + modelzoo/ELECTRA/vocab/vocab.txt | 30522 ++++++++++++++++ modelzoo/LanguageModeling/BERT/.dockerignore | 27 + modelzoo/LanguageModeling/BERT/.gitignore | 147 + modelzoo/LanguageModeling/BERT/Bert_result.md | 26 - modelzoo/LanguageModeling/BERT/Dockerfile | 55 + modelzoo/LanguageModeling/BERT/README.md | 8 +- .../LanguageModeling/BERT/bert_dllogger.json | 15 + .../BERT/classifier_data_lib.py | 581 + .../LanguageModeling/BERT/common_flags.py | 72 + .../BERT/create_finetuning_data.py | 184 + .../BERT/create_pretraining_data.py | 655 + .../BERT/data/BooksDownloader.py | 26 + .../BERT/data/BookscorpusTextFormatting.py | 32 + .../LanguageModeling/BERT/data/Downloader.py | 123 + .../BERT/data/GLUEDownloader.py | 46 + .../data/GooglePretrainedWeightDownloader.py | 157 + .../data/NVIDIAPretrainedWeightDownloader.py | 27 + .../BERT/data/PubMedDownloader.py | 93 + .../BERT/data/PubMedTextFormatting.py | 44 + modelzoo/LanguageModeling/BERT/data/README.md | 28 + .../BERT/data/SquadDownloader.py | 54 + .../BERT/data/TextSharding.py | 331 + .../BERT/data/WikiDownloader.py | 59 + .../BERT/data/WikicorpusTextFormatting.py | 46 + .../LanguageModeling/BERT/data/__init__.py | 12 + .../LanguageModeling/BERT/data/bertPrep.py | 388 + .../create_biobert_datasets_from_start.sh | 55 + .../BERT/data/create_datasets_from_start.sh | 71 + .../BERT/data/images/bert_pipeline.png | Bin 0 -> 212516 bytes .../BERT/data/images/images_nvlamb.png | Bin 0 -> 88164 bytes .../LanguageModeling/BERT/gpu_affinity.py | 63 + .../LanguageModeling/BERT/input_pipeline.py | 232 + ...uad_train_benchmark_base_fp16_gpu4_bs8.log | 477 - ...ing_squad_base_fp16_gbs48.230222025408.log | 594 - .../BERT/model_saving_utils.py | 101 + .../BERT/official/modeling/__init__.py | 0 .../official/modeling/hyperparams/__init__.py | 0 .../modeling/hyperparams/params_dict.py | 410 + .../modeling/hyperparams/params_dict_test.py | 322 + .../official/modeling/training/__init__.py | 0 .../modeling/training/distributed_executor.py | 800 + .../BERT/official}/nlp/bert_modeling.py | 358 +- .../BERT/official}/nlp/bert_models.py | 229 +- .../BERT/official/nlp/modeling/__init__.py | 1 + 
.../official/nlp/modeling/losses/__init__.py | 17 + ...eighted_sparse_categorical_crossentropy.py | 106 + ...ed_sparse_categorical_crossentropy_test.py | 381 + .../BERT/official/nlp/transformer/__init__.py | 0 .../nlp/transformer/beam_search_v1.py | 184 +- .../nlp/transformer/beam_search_v1_test.py | 15 +- .../official}/nlp/transformer/model_params.py | 11 +- .../official}/nlp/transformer/model_utils.py | 16 +- .../nlp/transformer/model_utils_test.py | 27 +- .../LanguageModeling/BERT/optimization.py | 140 +- modelzoo/LanguageModeling/BERT/run.sub | 82 + .../LanguageModeling/BERT/run_classifier.py | 402 + .../LanguageModeling/BERT/run_pretraining.py | 205 + modelzoo/LanguageModeling/BERT/run_squad.py | 414 +- .../BERT/run_squad_predict.py | 287 - .../benchmark_pretraining_lamb_phase2.sh | 8 +- .../BERT/scripts/docker/build.sh | 15 + .../BERT/scripts/docker/launch.sh | 28 + .../BERT/scripts/finetune_train_benchmark.sh | 55 +- .../BERT/scripts/gen_squad_evel.sh | 40 - .../BERT/scripts/run_inference_benchmark.sh | 4 +- .../scripts/run_inference_benchmark_seq128.sh | 4 +- .../BERT/scripts/run_pretraining_adam.sh | 6 +- .../scripts/run_pretraining_lamb_phase1.sh | 8 +- .../scripts/run_pretraining_lamb_phase2.sh | 8 +- .../BERT/scripts/run_squad.sh | 25 +- .../BERT/scripts/run_squad_inference.sh | 4 +- modelzoo/LanguageModeling/BERT/squad_lib.py | 877 + .../LanguageModeling/BERT/squad_lib_sp.py | 868 + modelzoo/LanguageModeling/BERT/tf_trt.py | 70 + .../LanguageModeling/BERT/tokenization.py | 537 + .../run_horovod.sh | 2 +- .../trainer.py | 4 +- .../a.py | 67 + .../models.py | 242 + .../run.py | 126 + .../run_dp.py | 139 + .../run.sh} | 52 +- .../CreditCardFraudDetection/run_horovod.sh | 77 - .../CreditCardFraudDetection/train.py | 46 +- modelzoo/Recommendation/Criteo_DCN/README.md | 32 + .../Criteo_DCN/datasets/__init__.py | 0 .../Criteo_DCN/datasets/custom_dataset.py | 26 + .../datasets/custom_dataset_test.py | 49 + modelzoo/Recommendation/Criteo_DCN/dcn_v2.py | 119 + modelzoo/Recommendation/Criteo_DCN/eval.py | 43 + .../Criteo_DCN/feature_map_small.csv | 41 + modelzoo/Recommendation/Criteo_DCN/infer.py | 43 + modelzoo/Recommendation/Criteo_DCN/run.sh | 37 + modelzoo/Recommendation/Criteo_DCN/train.py | 84 + modelzoo/Recommendation/Criteo_DCN/train1.py | 90 + modelzoo/Recommendation/MovieLens/mymodel.py | 57 + modelzoo/Recommendation/MovieLens/run.sh | 3 + .../Recommendation/MovieLens/run_ranking.sh | 3 + modelzoo/Recommendation/MovieLens/train.py | 94 + .../Recommendation/MovieLens/train_ranking.py | 81 + modelzoo/Recommendation/NCF/run_ncf.py | 7 +- ...s_2014_dien_fp16_gbs32768.230321132836.log | 2 +- ...ks_2014_din_fp16_gbs32768.230321132123.log | 2 +- ...ks_2014_sim_fp16_gbs32768.230321133429.log | 2 +- modelzoo/Recommendation/SIM/main.py | 5 +- modelzoo/Recommendation/SIM/run_dien.py | 4 +- modelzoo/Recommendation/SIM/run_din.py | 4 +- modelzoo/Recommendation/SIM/run_horovod.sh | 4 +- modelzoo/Recommendation/SIM/run_sim.py | 4 +- modelzoo/Recommendation/TFRA/demo.py | 68 - modelzoo/Recommendation/TFRA/demo_tfra.py | 50 - modelzoo/Recommendation/WideDeep/train.py | 4 +- .../avazu-ctr-prediction/ccpm.py | 2 - .../avazu-ctr-prediction/ccpm_diamond.py | 2 - .../avazu-ctr-prediction/run_horovod.sh | 2 +- .../avazu-ctr-prediction/train.py | 24 +- .../Frozen-Graph-TensorFlow/README.md | 38 - .../TensorFlow_v2/README.md | 75 - .../TensorFlow_v2/example_1.py | 103 - .../TensorFlow_v2/example_2.py | 167 - .../TensorFlow_v2/utils.py | 38 - modelzoo/Recommendation/criteo_ctr/dcn_v2.py | 140 +- 
.../criteo_ctr/feature_map_small.csv | 80 +- modelzoo/Recommendation/criteo_ctr/frozen.py | 46 - modelzoo/Recommendation/criteo_ctr/infer.py | 91 +- .../criteo_ctr/optimize_for_inference.py | 41 - .../Recommendation/criteo_ctr/run_horovod.sh | 90 - .../Recommendation/criteo_ctr/run_optimize.sh | 76 - modelzoo/Recommendation/criteo_ctr/train.py | 149 +- .../keras_horovod_dis/demo_tfra.py | 5 +- .../keras_horovod_distributed_demo.py | 2 +- recommendation/create_ncf_data.py | 2 +- recommendation/movielens.py | 2 +- recommendation/movielens_dataset.py | 8 +- recommendation/ncf_common.py | 4 +- recommendation/ncf_keras_main.py | 8 +- recommendation/ncf_test.py | 4 +- recommendation/ranking/common.py | 2 +- .../preprocessing/criteo_preprocess.py | 1 - recommendation/ranking/train.py | 3 +- recommendation/ranking/train_test.py | 2 +- requirements.txt | 14 +- setup.py | 3 - third_party/arrow/arrow-20.patch | 13 + third_party/arrow/arrow.BUILD | 206 +- third_party/clang_toolchain/BUILD | 0 .../clang_toolchain/cc_configure_clang.bzl | 27 + .../clang_toolchain/download_clang.bzl | 64 + third_party/cuCollections/BUILD | 0 third_party/cuCollections/cuCollections.BUILD | 26 + .../cuco.BUILD | 3 +- .../cucollection.patch | 80 +- third_party/cutlass.BUILD | 27 +- third_party/flash_attn/BUILD | 0 third_party/flash_attn/flash_attn.BUILD | 51 + third_party/flash_attn/flash_attn.patch | 450 + third_party/gpus/BUILD.bazel | 0 third_party/gpus/find_cuda_config.py | 161 +- third_party/leveldb.BUILD | 80 + third_party/openblas.BUILD | 1 + third_party/openssl.BUILD | 56 + third_party/py/BUILD | 40 + third_party/py/pypi.bzl | 54 + third_party/py/python_init_pip.bzl | 53 + third_party/readerwriterqueue.BUILD | 10 + third_party/remote_config/BUILD | 0 third_party/remote_config/common.bzl | 327 + third_party/repo.bzl | 244 + third_party/sparsehash.BUILD | 12 - third_party/sparsehash_c11/BUILD | 0 .../{ => sparsehash_c11}/sparsehash_c11.BUILD | 0 .../sparsehash_c11/sparsehash_c11.patch | 4956 +++ third_party/tf/BUILD | 0 third_party/tf/tf_215.patch | 31 + third_party/xla/BUILD.bazel | 25 + third_party/xla/workspace.bzl | 37 + tools/build_base_container.sh | 20 +- tools/docker/base_container.Dockerfile | 125 +- tools/docker/bashrc.bash | 4 +- tools/docker/bazel.bazelrc | 2 + tools/docker/build_wheel.Dockerfile | 2 +- tools/docker/entry.sh | 38 + tools/docker/py3.10_env.yml | 11 + tools/docker/py3.8_env.yml | 11 + tools/docker/sanity_check.Dockerfile | 12 +- tools/docs/build_docs.py | 2 +- tools/install_deps/install_clang.sh | 9 +- tools/install_deps/install_cmake.sh | 5 +- tools/install_deps/install_miniforge.sh | 58 + tools/install_deps/install_nsight-systems.sh | 24 + tools/install_deps/install_openmpi.sh | 15 +- tools/install_deps/install_python.sh | 50 +- tools/install_deps/pytest.txt | 7 - tools/install_deps/tensorflow-cpu.txt | 1 - tools/install_deps/tensorflow.txt | 2 +- tools/install_deps/typedapi.txt | 1 - tools/install_deps/yapf.txt | 2 +- tools/update_release_version.sh | 1 - 791 files changed, 130839 insertions(+), 16923 deletions(-) create mode 100644 build_deps/BUILD create mode 100644 build_deps/patches/BUILD create mode 100644 build_deps/patches/internal_visibility.patch create mode 100644 build_deps/patches/python_toolchain.patch create mode 100644 build_deps/patches/tensorflow_llvm_url.patch create mode 100644 build_deps/patches/tensorflow_serving.patch create mode 100644 build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch create mode 100644 build_deps/patches/tensorflow_zlib.patch create mode 
100644 build_deps/patches/tf2xla_visibility.patch create mode 100644 build_deps/pip_tf/BUILD create mode 100644 build_deps/pip_tf/README.md create mode 100644 build_deps/pip_tf/defs.bzl create mode 100644 build_deps/pip_tf/pip_tf_flags_test.py create mode 100644 build_deps/pip_tf/tensorflow.bzl create mode 100644 build_deps/requirements.in create mode 100644 build_deps/requirements_lock_3_10.txt create mode 100644 build_deps/requirements_lock_3_11.txt create mode 100644 build_deps/requirements_lock_3_12.txt create mode 100644 build_deps/requirements_lock_3_13.txt delete mode 100644 build_deps/toolchains/gpu/crosstool/BUILD.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl delete mode 100755 build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl delete mode 100644 build_deps/toolchains/gpu/cub.BUILD delete mode 100644 build_deps/toolchains/gpu/cuda/BUILD.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/BUILD.windows.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/cuda_config.h.tpl delete mode 100644 build_deps/toolchains/gpu/cuda_configure.bzl delete mode 100644 build_deps/toolchains/gpu/find_cuda_config.py create mode 100644 deepray/callbacks/model_checkpoint.py create mode 100644 deepray/callbacks/profiler_callback.py create mode 100644 deepray/callbacks/progbar_logger.py rename deepray/{utils/misc/keras_utils.py => callbacks/time_history.py} (65%) create mode 100644 deepray/callbacks/training_speed.py delete mode 100644 deepray/core/base_trainer.py delete mode 100644 deepray/core/base_trainer_test.py delete mode 100644 deepray/core/dllogger_class.py delete mode 100644 deepray/core/module.py create mode 100644 deepray/core/trainer.py delete mode 100644 deepray/core/utils/misc/keras_utils.py create mode 100644 deepray/custom_ops/embedding_bag/BUILD create mode 100644 deepray/custom_ops/embedding_bag/__init__.py create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc rename deepray/{layers/nlp => custom_ops/embedding_bag/python}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_bag/python/embedding_bag.py rename deepray/{layers/nlp/transformer => custom_ops/embedding_bag/python/tests}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py rename deepray/{seq2seq => custom_ops/embedding_bag/python}/tests/run_all_test.py (72%) create mode 100644 deepray/custom_ops/embedding_variable/BUILD create mode 100644 deepray/custom_ops/embedding_variable/__init__.py create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/batch.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h create mode 100644 
deepray/custom_ops/embedding_variable/cc/embedding/cache.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/config.proto create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h create mode 100644 
deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc create mode 100644 
deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc create mode 100644 
deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h create mode 100644 deepray/custom_ops/embedding_variable/config.proto create mode 100644 deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py create mode 100644 deepray/custom_ops/embedding_variable/multiplex_1_test.py rename deepray/{seq2seq/tests => custom_ops/embedding_variable/python}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py create mode 100644 deepray/custom_ops/embedding_variable/python/group_embedding_types.py create mode 100644 deepray/custom_ops/embedding_variable/python/kv_variable_ops.py rename build_deps/toolchains/gpu/BUILD => deepray/custom_ops/embedding_variable/python/tests/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py create mode 100644 deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py create mode 100644 deepray/custom_ops/embedding_variable/python/tests/run_all_test.py create mode 100644 deepray/custom_ops/embedding_variable/variable_scope.py create mode 100644 deepray/custom_ops/embedding_variable/variables.py rename build_deps/toolchains/gpu/crosstool/BUILD => deepray/custom_ops/multiplex_1/__init__.py (100%) rename build_deps/toolchains/gpu/cuda/BUILD => deepray/custom_ops/multiplex_2/__init__.py (100%) rename third_party/cucollection/BUILD => deepray/custom_ops/multiplex_3/__init__.py (100%) create mode 100644 deepray/custom_ops/multiplex_4/__init__.py create mode 100644 deepray/custom_ops/seq2seq/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/README.md (90%) create mode 100644 deepray/custom_ops/seq2seq/python/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/attention_wrapper.py (95%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/basic_decoder.py (87%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/beam_search_decoder.py (98%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/decoder.py (97%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/loss.py (99%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/sampler.py (99%) create mode 100644 deepray/custom_ops/seq2seq/python/tests/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/attention_wrapper_test.py (98%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/basic_decoder_test.py (99%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/beam_search_decoder_test.py (98%) rename deepray/{seq2seq => 
custom_ops/seq2seq/python}/tests/beam_search_ops_test.py (94%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/decoder_test.py (97%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/loss_test.py (99%) create mode 100644 deepray/custom_ops/seq2seq/python/tests/run_all_test.py delete mode 100644 deepray/custom_ops/unique_ops/cc/kernels/random.cc create mode 100644 deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py delete mode 100644 deepray/custom_ops/unique_ops/python/tests/unique_op_test.py create mode 100644 deepray/custom_ops/utils/BUILD create mode 100644 deepray/custom_ops/utils/check.h create mode 100644 deepray/custom_ops/utils/fake_input.cc create mode 100644 deepray/custom_ops/utils/fake_input.h create mode 100644 deepray/custom_ops/utils/kernel_benchmark_testlib.cc create mode 100644 deepray/custom_ops/utils/kernel_benchmark_testlib.h create mode 100644 deepray/custom_ops/utils/ok_status_util.h create mode 100644 deepray/custom_ops/utils/ops_testutil.cc create mode 100644 deepray/custom_ops/utils/ops_testutil.h create mode 100644 deepray/custom_ops/utils/ops_testutil_test.cc rename deepray/custom_ops/{unique_ops/cc/kernels/random_test.cc => utils/random.cc} (67%) rename deepray/custom_ops/{unique_ops/cc/kernels => utils}/random.h (82%) create mode 100644 deepray/custom_ops/utils/spin_lock.h create mode 100644 deepray/custom_ops/utils/spin_rw_lock.h create mode 100644 deepray/custom_ops/utils/tensor_testutil.cc create mode 100644 deepray/custom_ops/utils/tensor_testutil.h create mode 100644 deepray/custom_ops/utils/tensor_testutil_test.cc rename deepray/datasets/criteo/{docker => }/Dockerfile_preprocessing (100%) delete mode 100644 deepray/datasets/criteo/README.md create mode 100644 deepray/datasets/criteo/criteo_dataset.md create mode 100644 deepray/datasets/criteo/preproc/data/__init__.py create mode 100644 deepray/datasets/criteo/preproc/data/defaults.py create mode 100644 deepray/datasets/criteo/preproc/data/feature_spec.py rename deepray/datasets/criteo/{docker => }/requirements_preprocessing.txt (58%) delete mode 100644 deepray/datasets/csv_pipeline.py create mode 100644 deepray/datasets/csv_pipeline/__init__.py create mode 100644 deepray/datasets/csv_pipeline/csv_pipeline.py delete mode 100644 deepray/datasets/kafka_dataset.py create mode 100644 deepray/datasets/kafka_pipeline/__init__.py create mode 100644 deepray/datasets/kafka_pipeline/kafka_pipeline.py create mode 100644 deepray/datasets/kafka_pipeline/kafka_pipeline_test.py delete mode 100644 deepray/datasets/movielens/movielens_100k_ratings_test.py delete mode 100644 deepray/datasets/movielens/movielens_1m_ratings_test.py create mode 100644 deepray/datasets/movielens/movielens_ratings_test.py create mode 100644 deepray/datasets/squad/classifier_dataset.py create mode 100644 deepray/datasets/squad/pretrain_dataset.py create mode 100644 deepray/datasets/squad/squad_dataset.py create mode 100644 deepray/layers/dense.py create mode 100644 deepray/layers/embedding_variable.py create mode 100644 deepray/layers/tests_bak/on_device_embedding_test.py create mode 100644 deepray/losses/_loss_util.py create mode 100644 deepray/losses/losses_impl.py create mode 100644 deepray/losses/softmax_loss.py create mode 100644 deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py create mode 100644 deepray/losses/utils.py create mode 100644 deepray/losses/weighted_sparse_categorical_crossentropy.py create mode 100644 deepray/metrics/_ranking.py create mode 100644 
deepray/metrics/alpha_dcg.py create mode 100644 deepray/metrics/arp.py create mode 100644 deepray/metrics/dcg.py create mode 100644 deepray/metrics/hits.py create mode 100644 deepray/metrics/mean_average_precision.py create mode 100644 deepray/metrics/metrics_impl.py create mode 100644 deepray/metrics/mrr.py create mode 100644 deepray/metrics/ndcg.py create mode 100644 deepray/metrics/opa.py create mode 100644 deepray/metrics/precision.py create mode 100644 deepray/metrics/precision_ia.py create mode 100644 deepray/metrics/recall.py rename deepray/{layers/networks => models}/README.md (96%) rename deepray/{layers/networks => models}/albert_transformer_encoder.py (91%) rename deepray/{layers/networks => models}/bert_classifier.py (95%) rename deepray/{layers/networks => models}/bert_pretrainer.py (96%) rename deepray/{layers/networks => models}/bert_span_labeler.py (93%) rename deepray/{layers/networks => models}/classification.py (97%) rename deepray/{layers/networks => models}/encoder_scaffold.py (94%) rename deepray/{layers/networks => models}/masked_lm.py (98%) delete mode 100644 deepray/models/rec/tfra_demo.py delete mode 100644 deepray/models/rec/tower_new_tfra.py rename deepray/{layers/networks => models}/span_labeling.py (100%) create mode 100644 deepray/models/tests/__init__.py rename deepray/{layers/networks => models/tests}/albert_transformer_encoder_test.py (98%) rename deepray/{layers/networks => models/tests}/bert_classifier_test.py (100%) rename deepray/{layers/networks => models/tests}/bert_pretrainer_test.py (100%) rename deepray/{layers/networks => models/tests}/bert_span_labeler_test.py (100%) rename deepray/{layers/networks => models/tests}/classification_test.py (100%) rename deepray/{layers/networks => models/tests}/encoder_scaffold_test.py (96%) rename deepray/{layers/networks => models/tests}/masked_lm_test.py (98%) rename deepray/{layers/networks => models/tests}/span_labeling_test.py (100%) rename deepray/{layers/networks => models/tests}/transformer_encoder_test.py (99%) rename deepray/{layers/networks => models}/transformer_encoder.py (92%) create mode 100644 deepray/optimizers/adagrad.py create mode 100644 deepray/optimizers/adam_async.py create mode 100644 deepray/optimizers/ev_optimizer_patch.py create mode 100644 deepray/optimizers/ftrl.py create mode 100644 deepray/optimizers/gradient_descent.py create mode 100644 deepray/repo.bzl delete mode 100644 deepray/seq2seq/BUILD delete mode 100644 deepray/seq2seq/__init__.py delete mode 100644 deepray/tensorflow.bzl create mode 100644 deepray/utils/ckpt_util.py delete mode 100644 deepray/utils/dllogger_class.py create mode 100644 deepray/utils/logging_util.py delete mode 100644 deepray/utils/logs/hooks.py delete mode 100644 deepray/utils/logs/hooks_test.py delete mode 100644 deepray/utils/logs/metric_hook.py delete mode 100644 deepray/utils/logs/metric_hook_test.py create mode 100644 deepray/utils/timer.py delete mode 100644 modelzoo/CV/mnist/train_earlystop.py create mode 100644 modelzoo/ELECTRA/.gitignore create mode 100644 modelzoo/ELECTRA/Dockerfile create mode 100644 modelzoo/ELECTRA/LICENSE create mode 100644 modelzoo/ELECTRA/NOTICE create mode 100644 modelzoo/ELECTRA/README.md create mode 100644 modelzoo/ELECTRA/build_pretraining_dataset.py create mode 100644 modelzoo/ELECTRA/configuration.py create mode 100644 modelzoo/ELECTRA/configuration_utils.py create mode 100644 modelzoo/ELECTRA/data/BooksDownloader.py create mode 100644 modelzoo/ELECTRA/data/BookscorpusTextFormatting.py create mode 100644 
modelzoo/ELECTRA/data/Downloader.py create mode 100644 modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py create mode 100644 modelzoo/ELECTRA/data/MRPCDownloader.py create mode 100644 modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py create mode 100644 modelzoo/ELECTRA/data/SquadDownloader.py create mode 100644 modelzoo/ELECTRA/data/TextSharding.py create mode 100644 modelzoo/ELECTRA/data/WikiDownloader.py create mode 100644 modelzoo/ELECTRA/data/WikicorpusTextFormatting.py create mode 100644 modelzoo/ELECTRA/data/__init__.py create mode 100755 modelzoo/ELECTRA/data/create_datasets_from_start.sh create mode 100644 modelzoo/ELECTRA/data/dataPrep.py create mode 100755 modelzoo/ELECTRA/data/glue/download_mrpc.sh create mode 100755 modelzoo/ELECTRA/data/squad/squad_download.sh create mode 100644 modelzoo/ELECTRA/file_utils.py create mode 100644 modelzoo/ELECTRA/gpu_affinity.py create mode 100644 modelzoo/ELECTRA/images/total_loss.svg create mode 100644 modelzoo/ELECTRA/modeling.py create mode 100644 modelzoo/ELECTRA/modeling_utils.py create mode 100644 modelzoo/ELECTRA/optimization.py create mode 100644 modelzoo/ELECTRA/postprocess_pretrained_ckpt.py create mode 100644 modelzoo/ELECTRA/pretrain_utils.py create mode 100644 modelzoo/ELECTRA/run.sub create mode 100644 modelzoo/ELECTRA/run_inference.py create mode 100644 modelzoo/ELECTRA/run_pretraining.py create mode 100644 modelzoo/ELECTRA/run_tf_squad.py create mode 100644 modelzoo/ELECTRA/scripts/benchmark_pretraining.sh create mode 100644 modelzoo/ELECTRA/scripts/benchmark_squad.sh create mode 100755 modelzoo/ELECTRA/scripts/bind.sh create mode 100644 modelzoo/ELECTRA/scripts/configs/pretrain_config.sh create mode 100644 modelzoo/ELECTRA/scripts/configs/squad_config.sh create mode 100644 modelzoo/ELECTRA/scripts/docker/build.sh create mode 100644 modelzoo/ELECTRA/scripts/docker/launch.sh create mode 100644 modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh create mode 100644 modelzoo/ELECTRA/scripts/run_pretraining.sh create mode 100644 modelzoo/ELECTRA/scripts/run_squad.sh create mode 100644 modelzoo/ELECTRA/squad_utils.py create mode 100644 modelzoo/ELECTRA/tokenization.py create mode 100644 modelzoo/ELECTRA/tokenization_utils.py create mode 100644 modelzoo/ELECTRA/utils.py create mode 100755 modelzoo/ELECTRA/vocab/vocab.txt create mode 100644 modelzoo/LanguageModeling/BERT/.dockerignore create mode 100644 modelzoo/LanguageModeling/BERT/.gitignore delete mode 100644 modelzoo/LanguageModeling/BERT/Bert_result.md create mode 100644 modelzoo/LanguageModeling/BERT/Dockerfile create mode 100644 modelzoo/LanguageModeling/BERT/bert_dllogger.json create mode 100644 modelzoo/LanguageModeling/BERT/classifier_data_lib.py create mode 100644 modelzoo/LanguageModeling/BERT/common_flags.py create mode 100644 modelzoo/LanguageModeling/BERT/create_finetuning_data.py create mode 100644 modelzoo/LanguageModeling/BERT/create_pretraining_data.py create mode 100644 modelzoo/LanguageModeling/BERT/data/BooksDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/Downloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py create mode 100644 
modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/README.md create mode 100644 modelzoo/LanguageModeling/BERT/data/SquadDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/TextSharding.py create mode 100644 modelzoo/LanguageModeling/BERT/data/WikiDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/data/bertPrep.py create mode 100644 modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh create mode 100644 modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh create mode 100644 modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png create mode 100644 modelzoo/LanguageModeling/BERT/data/images/images_nvlamb.png create mode 100644 modelzoo/LanguageModeling/BERT/gpu_affinity.py create mode 100644 modelzoo/LanguageModeling/BERT/input_pipeline.py delete mode 100644 modelzoo/LanguageModeling/BERT/logs/squad_train_benchmark_base_fp16_gpu4_bs8.log delete mode 100644 modelzoo/LanguageModeling/BERT/logs/tf_bert_finetuning_squad_base_fp16_gbs48.230222025408.log create mode 100644 modelzoo/LanguageModeling/BERT/model_saving_utils.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/params_dict.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/params_dict_test.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/training/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/training/distributed_executor.py rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/bert_modeling.py (81%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/bert_models.py (61%) create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy_test.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/transformer/__init__.py rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/beam_search_v1.py (83%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/beam_search_v1_test.py (85%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_params.py (94%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_utils.py (89%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_utils_test.py (74%) create mode 100644 modelzoo/LanguageModeling/BERT/run.sub create mode 100644 modelzoo/LanguageModeling/BERT/run_classifier.py create mode 100644 modelzoo/LanguageModeling/BERT/run_pretraining.py delete mode 100644 modelzoo/LanguageModeling/BERT/run_squad_predict.py create mode 100644 modelzoo/LanguageModeling/BERT/scripts/docker/build.sh create mode 100644 modelzoo/LanguageModeling/BERT/scripts/docker/launch.sh delete mode 100644 
modelzoo/LanguageModeling/BERT/scripts/gen_squad_evel.sh create mode 100644 modelzoo/LanguageModeling/BERT/squad_lib.py create mode 100644 modelzoo/LanguageModeling/BERT/squad_lib_sp.py create mode 100644 modelzoo/LanguageModeling/BERT/tf_trt.py create mode 100644 modelzoo/LanguageModeling/BERT/tokenization.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/a.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/models.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/run.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/run_dp.py rename modelzoo/Recommendation/{TFRA/run_horovod.sh => CreditCardFraudDetection/run.sh} (58%) delete mode 100644 modelzoo/Recommendation/CreditCardFraudDetection/run_horovod.sh create mode 100644 modelzoo/Recommendation/Criteo_DCN/README.md create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/__init__.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/custom_dataset.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/custom_dataset_test.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/dcn_v2.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/eval.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/feature_map_small.csv create mode 100644 modelzoo/Recommendation/Criteo_DCN/infer.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/run.sh create mode 100644 modelzoo/Recommendation/Criteo_DCN/train.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/train1.py create mode 100644 modelzoo/Recommendation/MovieLens/mymodel.py create mode 100644 modelzoo/Recommendation/MovieLens/run.sh create mode 100644 modelzoo/Recommendation/MovieLens/run_ranking.sh create mode 100644 modelzoo/Recommendation/MovieLens/train.py create mode 100644 modelzoo/Recommendation/MovieLens/train_ranking.py delete mode 100644 modelzoo/Recommendation/TFRA/demo.py delete mode 100644 modelzoo/Recommendation/TFRA/demo_tfra.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/README.md delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/README.md delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/example_1.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/example_2.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/utils.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/frozen.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/optimize_for_inference.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/run_horovod.sh delete mode 100644 modelzoo/Recommendation/criteo_ctr/run_optimize.sh create mode 100644 third_party/arrow/arrow-20.patch create mode 100644 third_party/clang_toolchain/BUILD create mode 100644 third_party/clang_toolchain/cc_configure_clang.bzl create mode 100644 third_party/clang_toolchain/download_clang.bzl create mode 100644 third_party/cuCollections/BUILD create mode 100644 third_party/cuCollections/cuCollections.BUILD rename third_party/{cucollection => cuCollections}/cuco.BUILD (99%) rename third_party/{cucollection => cuCollections}/cucollection.patch (90%) create mode 100644 third_party/flash_attn/BUILD create mode 100644 third_party/flash_attn/flash_attn.BUILD create mode 100644 third_party/flash_attn/flash_attn.patch create mode 100644 
third_party/gpus/BUILD.bazel create mode 100644 third_party/leveldb.BUILD create mode 100644 third_party/openssl.BUILD create mode 100644 third_party/py/BUILD create mode 100644 third_party/py/pypi.bzl create mode 100644 third_party/py/python_init_pip.bzl create mode 100644 third_party/readerwriterqueue.BUILD create mode 100644 third_party/remote_config/BUILD create mode 100644 third_party/remote_config/common.bzl create mode 100644 third_party/repo.bzl delete mode 100644 third_party/sparsehash.BUILD create mode 100644 third_party/sparsehash_c11/BUILD rename third_party/{ => sparsehash_c11}/sparsehash_c11.BUILD (100%) create mode 100644 third_party/sparsehash_c11/sparsehash_c11.patch create mode 100644 third_party/tf/BUILD create mode 100644 third_party/tf/tf_215.patch create mode 100644 third_party/xla/BUILD.bazel create mode 100644 third_party/xla/workspace.bzl create mode 100644 tools/docker/bazel.bazelrc create mode 100644 tools/docker/entry.sh create mode 100644 tools/docker/py3.10_env.yml create mode 100644 tools/docker/py3.8_env.yml create mode 100644 tools/install_deps/install_miniforge.sh create mode 100644 tools/install_deps/install_nsight-systems.sh delete mode 100644 tools/install_deps/pytest.txt delete mode 100644 tools/install_deps/tensorflow-cpu.txt delete mode 100644 tools/install_deps/typedapi.txt diff --git a/.bazelversion b/.bazelversion index 7d3cdbf0..4be2c727 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -5.3.1 \ No newline at end of file +6.5.0 \ No newline at end of file diff --git a/BUILD b/BUILD index 174fad05..369f8476 100644 --- a/BUILD +++ b/BUILD @@ -1,6 +1,45 @@ +load("@bazel_skylib//rules:build_test.bzl", "build_test") + +# Copyright 2024 The Deepray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +load("@rules_license//rules:license.bzl", "license") +load("//third_party/py:pypi.bzl", "pypi_requirement") + +package( + default_applicable_licenses = [":license"], + default_visibility = ["//deepray:__subpackages__"], +) + +license( + name = "license", + package_name = "deepray", +) + +exports_files([ + "LICENSE", + "setup.py", + "MANIFEST.in", + "README.md", + "requirements.txt", +]) + +############################################################################### +# PIP Package +############################################################################### sh_binary( name = "build_pip_pkg", - srcs = ["build_deps/build_pip_pkg.sh"], + srcs = ["//build_deps:build_pip_pkg.sh"], data = [ "LICENSE", "MANIFEST.in", diff --git a/README.md b/README.md index b40e2573..a0fdd4a4 100644 --- a/README.md +++ b/README.md @@ -1,220 +1,152 @@ +## **Introduction** +Deepray is a deep learning framework for Keras, to build model like LEGO, and train model with easier, faster and cheaper way. 
------------------ -[![PyPI Status Badge](https://badge.fury.io/py/deepray.svg)](https://pypi.org/project/deepray/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/deepray)](https://pypi.org/project/deepray/) -[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/deepray/api_docs/python/dp) -[![Gitter chat](https://img.shields.io/badge/chat-on%20gitter-46bc99.svg)](https://gitter.im/tensorflow/sig-deepray) -[![Code style: yapf](https://img.shields.io/badge/code%20style-yapf-blue)](https://github.com/google/yapf) +## **Why Deepray?** +Deepray contains list of features to improve usability and performance for Deep Learning, especially provides some essential components for recommendation algorithm. -### Continuous Build Status -| Build | Status | -| --- | --- | -| **Ubuntu** | [![Status](https://github.com/deepray-AI/deepray/workflows/deepray-release/badge.svg)](https://github.com/deepray-AI/deepray/actions?query=workflow%3deepray-release) | +**Trainer** + - Distributed Training with Horovod backend + - Gradient accumulation +**Layers** + - Embedding Variable from [DeepRec](https://github.com/DeepRec-AI/DeepRec). + - Compositional Embedding + - Feature Cross layer for recommendation algorithm + - ...... -**Deepray** is a repository of contributions that conform to -well-established API patterns, but implement new functionality -not available in core TensorFlow. TensorFlow natively supports -a large number of operators, layers, metrics, losses, and optimizers. -However, in a fast moving field like ML, there are many interesting new -developments that cannot be integrated into core TensorFlow -(because their broad applicability is not yet clear, or it is mostly - used by a smaller subset of the community). +**Kernels** + - Group Embedding for Embedding Variable + - ...... +**Optimizer** + - Adam/Adagrad/SDG/FTRL Optimizer for Embedding Variable + - AdamAsync Optimizer + - MultiOptimizer + - ...... -## Maintainership -The maintainer of Deepray now is [@fuhailin](https://github.com/fuhailin). If you would -like to maintain something, please feel free to submit a PR. We encourage multiple -owners for all submodules. +**Datasets** + - Parquet Dataset from [HybridBackend](https://github.com/DeepRec-AI/HybridBackend) + - ...... -## Installation -#### Stable Builds -Deepray is available on PyPI for Linux. To install the latest version, run the following: -``` -pip install deepray -``` +**......** -To ensure you have a version of TensorFlow that is compatible with Deepray, you can specify the `tensorflow` extra requirement during install: +#### Compatibility Matrix +| Deepray | TensorFlow | Compiler | cuDNN | CUDA | +| :-------------- | :--------- | :--------- | :---- | :--- | +| deepray-0.21.86 | 2.15 | GCC 11.4.0 | 8.9 | 12.3.2 | -``` -pip install deepray[tensorflow] -``` -Similar extras exist for the `tensorflow-gpu` and `tensorflow-cpu` packages. To use Deepray: +# Quick start + - Install Deepray: -```python -import tensorflow as tf -import deepray as dp +```bash +pip install deepray ``` -### Python Op Compatility -Deepray is actively working towards forward compatibility with TensorFlow 2.x. -However, there are still a few private API uses within the repository so at the moment -we can only guarantee compatibility with the TensorFlow versions which it was tested against. -Warnings will be emitted when importing `deepray` if your TensorFlow version does not match -what it was tested against. 
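+After installing, an optional sanity check (not part of the package itself) is to confirm that the
+TensorFlow build roughly matches the compatibility matrix above (TF 2.15, CUDA 12.3.2) and that the
+GPUs are visible. This is a minimal sketch using only standard TensorFlow APIs:
+
+```python
+# Optional sanity check after `pip install deepray`.
+import tensorflow as tf
+import deepray as dp  # a successful import is a basic check that the package is installed
+
+print("TensorFlow:", tf.__version__)
+print("GPUs visible:", tf.config.list_physical_devices("GPU"))
+```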
- -#### Python Op Compatibility Matrix -| Deepray | TensorFlow | Python | -| :------------- | :------------- | :------------------ | -| deepray-0.18.0 | 2.9.3 | 3.8, 3.9, 3.10, 3.11 | - - -### C++ Custom Op Compatibility -TensorFlow C++ APIs are not stable and thus we can only guarantee compatibility with the -version Deepray was built against. It is possible custom ops will work with multiple -versions of TensorFlow, but there is also a chance for segmentation faults or other problematic crashes. -Warnings will be emitted when loading a custom op if your TensorFlow version does not match -what it was built against. - -Additionally, custom ops registration does not have a stable ABI interface so it is -required that users have a compatible installation of TensorFlow even if the versions -match what we had built against. A simplification of this is that **Deepray -custom ops will work with `pip`-installed TensorFlow** but will have issues when TensorFlow -is compiled differently. A typical example of this would be `conda`-installed TensorFlow. -[RFC #133](https://github.com/tensorflow/community/pull/133) aims to fix this. - - -#### C++ Custom Op Compatibility Matrix -| Deepray | TensorFlow | Compiler | cuDNN | CUDA | -| :------------- | :--------- | :-------- | :---- | :--- | -| deepray-0.18.0 | 2.12 | GCC 9.3.1 | 8.1 | 11.8 | - - - -#### Installing from Source -You can also install from source. This requires the [Bazel]( -https://bazel.build/) build system (version >= 1.0.0). - -##### CPU Custom Ops + - Using Docker(**Recommended**): +Latest Release Images: **hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04** ``` -git clone https://github.com/deepray-AI/deepray.git -cd deepray - -# This script links project with TensorFlow dependency -python3 ./configure.py - -bazel build build_pip_pkg -bazel-bin/build_pip_pkg artifacts - -pip install artifacts/deepray-*.whl +docker pull hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04 +docker run -it hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04 ``` -##### GPU and CPU Custom Ops + - Build from source: ``` git clone https://github.com/deepray-AI/deepray.git -cd deepray +cd deepray && bash build.sh +``` -export TF_NEED_CUDA="1" -# Set these if the below defaults are different on your system -export TF_CUDA_VERSION="11" -export TF_CUDNN_VERSION="8" -export CUDA_TOOLKIT_PATH="/usr/local/cuda" -export CUDNN_INSTALL_PATH="/usr/lib/x86_64-linux-gnu" +### Deepray example +Define the training workflow. Here's a toy example ([explore real examples](https://github.com/deepray-AI/deepray/blob/main/modelzoo/Recommendation/CreditCardFraudDetection/train.py)): -# This script links project with TensorFlow dependency -python3 ./configure.py +```python +# main.py +# ! pip install deepray +from typing import Dict -bazel build build_pip_pkg -bazel-bin/build_pip_pkg artifacts +import tensorflow as tf +from absl import flags -pip install artifacts/deepray-*.whl +import deepray as dp +from deepray.core.trainer import Trainer +from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating +from deepray.layers.embedding_variable import EmbeddingVariable + +# -------------------------------- +# Step 1: Define a Keras Module +# -------------------------------- +class RankingModel(tf.keras.Model): + + def __init__(self, embedding_dimension=32): + super().__init__() + # Compute embeddings for users. 
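+    # EmbeddingVariable (the DeepRec-style dynamic embedding listed under "Layers" above)
+    # is keyed directly by raw ids and grows on demand, which is why only the embedding
+    # dimension is configured here and no fixed vocabulary size is declared.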
+ self.user_embeddings = EmbeddingVariable(embedding_dim=embedding_dimension) + self.movie_embeddings = EmbeddingVariable(embedding_dim=embedding_dimension) + + # Compute predictions. + self.ratings = tf.keras.Sequential( + [ + # Learn multiple dense layers. + tf.keras.layers.Dense(256, activation="relu"), + tf.keras.layers.Dense(64, activation="relu"), + # Make rating predictions in the final layer. + tf.keras.layers.Dense(1) + ] + ) + + def call(self, inputs: Dict[str, tf.Tensor]) -> tf.Tensor: + user_id, movie_title = inputs["user_id"], inputs["movie_title"] + user_id = tf.reshape(user_id, [-1]) + movie_title = tf.reshape(movie_title, [-1]) + user_embedding = self.user_embeddings(user_id) + movie_embedding = self.movie_embeddings(movie_title) + emb_vec = tf.concat([user_embedding, movie_embedding], axis=1) + return self.ratings(emb_vec) + + +# ------------------- +# Step 2: Define data +# ------------------- +data_pipe = Movielens100kRating(split=True) +dataset = data_pipe(flags.FLAGS.batch_size, is_training=True) + +# ------------------- +# Step 3: Train +# ------------------- +optimizer = dp.optimizers.Adam(learning_rate=flags.FLAGS.learning_rate, amsgrad=False) +model = RankingModel() +trainer = Trainer(model=model, optimizer=optimizer, loss="MSE", metrics=[tf.keras.metrics.RootMeanSquaredError()]) +trainer.fit(x=dataset) ``` -## Tutorials -See [`docs/tutorials/`](docs/tutorials/) -for end-to-end examples of various deepray. - -## Core Concepts +Run the model on your terminal -#### Standardized API within Subpackages -User experience and project maintainability are core concepts in -Deepray. In order to achieve these we require that our additions -conform to established API patterns seen in core TensorFlow. - -#### GPU and CPU Custom Ops -Deepray supports precompiled custom ops for CPU and GPU. However, -GPU custom ops currently only work on Linux distributions. For this reason Windows and macOS -will fallback to pure TensorFlow Python implementations whenever possible. +```bash +python main.py --batch_size=32 --learning_rate=0.03 +``` +---- +## Examples -The order of priority on macOS/Windows is: -1) Pure TensorFlow + Python implementation (works on CPU and GPU) -2) C++ implementation for CPU +###### Recommender Systems -The order of priority on Linux is: -1) CUDA implementation -2) C++ implementation -3) Pure TensorFlow + Python implementation (works on CPU and GPU) +- [Deep & Cross Network V2 with Criteo](https://github.com/deepray-AI/deepray/tree/main/modelzoo/Recommendation/criteo_ctr) +- [MovieLens](https://github.com/deepray-AI/deepray/tree/main/modelzoo/Recommendation) -If you want to change the default priority, "C++ and CUDA" VS "pure TensorFlow Python", -you can set the environment variable `DEEPRAY_PY_OPS=1` from the command line or -run `dp.options.disable_custom_kernel()` in your code. +###### Natural Language Processing -For example, if you are on Linux and you have compatibility problems with the compiled ops, -you can give priority to the Python implementations: +- [BERT](https://github.com/deepray-AI/deepray/tree/main/modelzoo/LanguageModeling/BERT) -From the command line: -```bash -export DEEPRAY_PY_OPS=1 -``` +###### Computer Vision -or in your code: - -```python -import deepray as dp -dp.options.disable_custom_kernel() -``` +- [Mnist](https://github.com/deepray-AI/deepray/tree/main/modelzoo/CV/mnist) -This variable defaults to `True` on Windows and macOS, and `False` on Linux. 
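+For a quick look at what the fitted model produces, the trained `model` from the toy example above
+can be called directly on one batch. This is only a sketch: it assumes the Movielens100kRating
+pipeline yields `(features, labels)` batches whose feature dict matches `RankingModel.call`; adjust
+the field handling if your pipeline differs.
+
+```python
+# Evaluate one batch with the model trained by trainer.fit(...) above.
+import tensorflow as tf
+
+for features, labels in dataset.take(1):
+  preds = tf.squeeze(model(features), axis=-1)  # [batch] predicted ratings
+  labels = tf.cast(labels, preds.dtype)
+  rmse = tf.sqrt(tf.reduce_mean(tf.square(labels - preds)))
+  print("RMSE on one batch:", float(rmse))
+```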
- -## Contributing -Deepray is a community-led open source project (only a few maintainers work for Google!). -As such, the project depends on public contributions, bug fixes, and documentation. -This project adheres to [TensorFlow's code of conduct](CODE_OF_CONDUCT.md). -By participating, you are expected to uphold this code. - -Do you want to contribute but are not sure of what? Here are a few suggestions: -1. Add a new tutorial. Located in [`docs/tutorials/`](docs/tutorials), - these are a great way to familiarize yourself and others with Deepray. See - [the guidelines](docs/tutorials/README.md) for more information on how to add - examples. -2. Improve the docstrings. The docstrings are fetched and then displayed in the documentation. - Do a change and hundreds of developers will see it and benefit from it. Maintainers are often focused - on making APIs, fixing bugs and other code related changes. The documentation will never - be loved enough! -3. Solve an [existing issue](https://github.com/deepray-AI/deepray/issues). - These range from low-level software bugs to higher-level design problems. - Check out the label [help wanted](https://github.com/tensorflow/deepray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22). If you're a new contributor, the label [good first issue](https://github.com/tensorflow/deepray/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) can be a good place to start. -4. Review a pull request. So you're not a software engineer but you know a lot - about a certain field a research? That's awesome and we need your help! Many people - are submitting pull requests to add layers/optimizers/functions taken from recent - papers. Since Deepray maintainers are not specialized in everything, - you can imagine how hard it is to review. It takes very long to read the paper, - understand it and check the math in the pull request. If you're specialized, look at - the [list of pull requests](https://github.com/deepray-AI/deepray/pulls). - If there is something from a paper you know, please comment on the pull request to - check the math is ok. If you see that everything is good, say it! It will help - the maintainers to sleep better at night knowing that he/she wasn't the only - person to approve the pull request. -5. You have an opinion and want to share it? The docs are not very helpful for - a function or a class? You tried to open a pull request but you didn't manage to - install or test anything and you think it's too complicated? You made a pull request - but you didn't find the process good enough and it made no sense to you? Please - say it! We want feedback. Maintainers are too much the head into the code - to understand what it's like for someone new to open source to come to this project. - If you don't understand something, be aware there are no people who are - bad at understanding, there are just bad tutorials and bad guides. - -Please see [contribution guidelines](CONTRIBUTING.md) to get started (and remember, -if you don't understand something, open an issue, or even make a pull request to -improve the guide!). - -## Community -* [Public Mailing List](https://groups.google.com/a/tensorflow.org/forum/#!forum/deepray) +## Communication +- [GitHub issues](https://github.com/deepray-AI/deepray/issues): any install, bug, feature issues. 
+- 微信号: StateOfArt ## License [Apache License 2.0](LICENSE) diff --git a/WORKSPACE b/WORKSPACE index 74519138..0fe7ed52 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,16 +1,99 @@ -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") -load("//build_deps/toolchains/gpu:cuda_configure.bzl", "cuda_configure") +workspace(name = "deepray") + +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") # buildifier: disable=load-on-top +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") # buildifier: disable=load-on-top + +http_archive( + name = "rules_python", + sha256 = "d71d2c67e0bce986e1c5a7731b4693226867c45bfe0b7c5e0067228a536fc580", + strip_prefix = "rules_python-0.29.0", + url = "https://github.com/bazelbuild/rules_python/releases/download/0.29.0/rules_python-0.29.0.tar.gz", +) + +load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains") # buildifier: disable=load-on-top + +py_repositories() + +python_register_toolchains( + name = "python", + ignore_root_user_error = True, + python_version = "3.10", +) + +load("//third_party/xla:workspace.bzl", xla_repo = "repo") -tf_configure( - name = "local_config_tf", +xla_repo() + +# Initialize hermetic Python +load("@xla//third_party/py:python_init_rules.bzl", "python_init_rules") + +python_init_rules() + +load("@xla//third_party/py:python_init_repositories.bzl", "python_init_repositories") + +python_init_repositories( + default_python_version = "system", + requirements = { + "3.10": "//build_deps:requirements_lock_3_10.txt", + "3.11": "//build_deps:requirements_lock_3_11.txt", + "3.12": "//build_deps:requirements_lock_3_12.txt", + "3.13": "//build_deps:requirements_lock_3_13.txt", + }, ) +load("@xla//third_party/py:python_init_toolchains.bzl", "python_init_toolchains") + +python_init_toolchains() + +load("//third_party/py:python_init_pip.bzl", "python_init_pip") + +python_init_pip() + +load("@pypi//:requirements.bzl", "install_deps") + +install_deps() + +load("@xla//:workspace4.bzl", "xla_workspace4") + +xla_workspace4() + +load("@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl", "cuda_json_init_repository") + +cuda_json_init_repository() + +load("@cuda_redist_json//:distributions.bzl", "CUDA_REDISTRIBUTIONS", "CUDNN_REDISTRIBUTIONS") +load("@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl", "cuda_redist_init_repositories", "cudnn_redist_init_repository") + +cuda_redist_init_repositories(cuda_redistributions = CUDA_REDISTRIBUTIONS) + +cudnn_redist_init_repository(cudnn_redistributions = CUDNN_REDISTRIBUTIONS) + +load("@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl", "cuda_configure") + +cuda_configure(name = "local_config_cuda") + +load("@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl", "nccl_redist_init_repository") + +nccl_redist_init_repository() + +load("@tsl//third_party/nccl/hermetic:nccl_configure.bzl", "nccl_configure") + +nccl_configure(name = "local_config_nccl") + +load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") + +tf_configure(name = "local_config_tf") + http_archive( name = "org_tensorflow", - strip_prefix = "tensorflow-2.9.1", + patch_args = ["-p1"], + patches = [ + "//third_party/tf:tf_215.patch", + ], + sha256 = "f36416d831f06fe866e149c7cd752da410a11178b01ff5620e9f265511ed57cf", + strip_prefix = "tensorflow-2.15.1", urls = [ - "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.9.1.tar.gz", + 
"https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.15.1.tar.gz", ], ) @@ -30,12 +113,6 @@ load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0") tf_workspace0() -# Initialize the TensorFlow repository and all dependencies. -# -# The cascade of load() statements and tf_workspace?() calls works around the -# restriction that load() statements need to be at the top of .bzl files. -# E.g. we can not retrieve a new repository with http_archive and then load() -# a macro from that repository in the same file. load("@//deepray:workspace3.bzl", "dp_workspace3") dp_workspace3() @@ -51,5 +128,3 @@ dp_workspace2() load("@//deepray:workspace0.bzl", "dp_workspace0") dp_workspace0() - -cuda_configure(name = "local_config_cuda") diff --git a/build.sh b/build.sh index 8f2db504..b2435923 100644 --- a/build.sh +++ b/build.sh @@ -3,11 +3,10 @@ set -e yes "" | bash ./configure || true -# bazel build build_pip_pkg \ -# --action_env=HTTP_PROXY=http://127.0.0.1:7890 \ -# --action_env=HTTPS_PROXY=http://127.0.0.1:7890 - -bazel build build_pip_pkg +# --compilation_mode dbg \ +bazel build build_pip_pkg \ + --copt=-O3 --copt=-march=native \ + -s rm -rf artifacts/ diff --git a/build_deps/BUILD b/build_deps/BUILD new file mode 100644 index 00000000..8298d741 --- /dev/null +++ b/build_deps/BUILD @@ -0,0 +1,37 @@ +# Copyright 2024 The Deepray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +load("@python//:defs.bzl", "compile_pip_requirements") +load("@python_version_repo//:py_version.bzl", "REQUIREMENTS") + +licenses(["notice"]) + +package( + default_visibility = ["//deepray:__subpackages__"], +) + +exports_files(["build_pip_pkg.sh"]) + +compile_pip_requirements( + name = "requirements", + timeout = "moderate", + extra_args = [ + "--allow-unsafe", + "--build-isolation", + "--rebuild", + "--resolver=backtracking", + "-i https://pypi.tuna.tsinghua.edu.cn/simple", + ], + requirements_in = "requirements.in", + requirements_txt = REQUIREMENTS, +) diff --git a/build_deps/build_pip_pkg.sh b/build_deps/build_pip_pkg.sh index 07fe4ca4..5e962bcc 100755 --- a/build_deps/build_pip_pkg.sh +++ b/build_deps/build_pip_pkg.sh @@ -28,9 +28,9 @@ function is_macos() { } if is_windows; then - PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.exe.runfiles/__main__/" + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.exe.runfiles/deepray/" else - PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/__main__/" + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/deepray/" fi function abspath() { diff --git a/build_deps/patches/BUILD b/build_deps/patches/BUILD new file mode 100644 index 00000000..a4988a91 --- /dev/null +++ b/build_deps/patches/BUILD @@ -0,0 +1,15 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This empty BUILD file is required to make Bazel treat this directory as a package. diff --git a/build_deps/patches/internal_visibility.patch b/build_deps/patches/internal_visibility.patch new file mode 100644 index 00000000..7703001e --- /dev/null +++ b/build_deps/patches/internal_visibility.patch @@ -0,0 +1,13 @@ +diff --git tensorflow/BUILD tensorflow/BUILD +index 202553cd531..171eb04665c 100644 +--- tensorflow/BUILD ++++ tensorflow/BUILD +@@ -1039,7 +1039,7 @@ package_group( + "//smartass/brain/configure/...", + "//tensorflow/...", + "//tensorflow_decision_forests/...", +- "//tensorflow_federated/...", ++ "public", + "//third_party/cloud_tpu/convergence_tools/sdc_monitoring/...", + "//third_party/cloud_tpu/inference_converter/...", + "//third_party/py/cloud_ml_autoflow/...", diff --git a/build_deps/patches/python_toolchain.patch b/build_deps/patches/python_toolchain.patch new file mode 100644 index 00000000..14f3dc69 --- /dev/null +++ b/build_deps/patches/python_toolchain.patch @@ -0,0 +1,74 @@ +diff --git tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl +index a2bdd6a7eed..ec25c23d8d4 100644 +--- tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl ++++ tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl +@@ -2,7 +2,7 @@ + + load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure") + load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") +-load("//third_party/py:python_configure.bzl", "remote_python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "remote_python_configure") + + def ml2014_tf_aarch64_configs(name_container_map, env): + for name, container in name_container_map.items(): +diff --git tensorflow/tools/toolchains/remote_config/rbe_config.bzl tensorflow/tools/toolchains/remote_config/rbe_config.bzl +index 9f71a414bf7..57f70752323 100644 +--- tensorflow/tools/toolchains/remote_config/rbe_config.bzl ++++ tensorflow/tools/toolchains/remote_config/rbe_config.bzl +@@ -1,6 +1,6 @@ + """Macro that creates external repositories for remote config.""" + +-load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "local_python_configure", "remote_python_configure") + load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") + load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") +diff --git tensorflow/workspace2.bzl tensorflow/workspace2.bzl +index 7e9faa558a4..5b18cb0969a 100644 +--- tensorflow/workspace2.bzl ++++ tensorflow/workspace2.bzl +@@ -8,7 +8,7 @@ load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") + load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure") + load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + load("//third_party/git:git_configure.bzl", "git_configure") 
+-load("//third_party/py:python_configure.bzl", "python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "python_configure") + load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure") + load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure") + load("//tensorflow/tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure") +diff --git third_party/py/non_hermetic/python_configure.bzl third_party/py/non_hermetic/python_configure.bzl +index 300cbfb6c71..09d98505dd9 100644 +--- third_party/py/non_hermetic/python_configure.bzl ++++ third_party/py/non_hermetic/python_configure.bzl +@@ -206,7 +206,7 @@ def _create_local_python_repository(repository_ctx): + # Resolve all labels before doing any real work. Resolving causes the + # function to be restarted with all previous state being lost. This + # can easily lead to a O(n^2) runtime in the number of labels. +- build_tpl = repository_ctx.path(Label("//third_party/py:BUILD.tpl")) ++ build_tpl = repository_ctx.path(Label("//third_party/py/non_hermetic:BUILD.tpl")) + + python_bin = get_python_bin(repository_ctx) + _check_python_bin(repository_ctx, python_bin) +diff --git third_party/py/numpy/BUILD third_party/py/numpy/BUILD +index 97c7907fc38..c80cc5287bc 100644 +--- third_party/py/numpy/BUILD ++++ third_party/py/numpy/BUILD +@@ -2,14 +2,15 @@ licenses(["restricted"]) + + package(default_visibility = ["//visibility:public"]) + +-alias( ++py_library( + name = "numpy", +- actual = "@pypi_numpy//:pkg", ++ srcs = ["tf_numpy_dummy.py"] ++ srcs_version = "PY3", + ) + + alias( + name = "headers", +- actual = "@pypi_numpy//:numpy_headers", ++ actual = "@local_config_python//:numpy_headers", + ) + + genrule( diff --git a/build_deps/patches/tensorflow_llvm_url.patch b/build_deps/patches/tensorflow_llvm_url.patch new file mode 100644 index 00000000..88136b56 --- /dev/null +++ b/build_deps/patches/tensorflow_llvm_url.patch @@ -0,0 +1,23 @@ +diff --git third_party/llvm/workspace.bzl third_party/llvm/workspace.bzl +index 038e0ee5fe5..4693f5cfadc 100644 +--- third_party/llvm/workspace.bzl ++++ third_party/llvm/workspace.bzl +@@ -5,15 +5,15 @@ load("//third_party:repo.bzl", "tf_http_archive") + def repo(name): + """Imports LLVM.""" + LLVM_COMMIT = "668e33c6401abe7844691fb7d47a3cf2d2012dbc" +- LLVM_SHA256 = "b97fefaa486b106c8dd45b963116ed7684d8f3f55682116d5760b0b60db17702" ++ LLVM_SHA256 = "f6659fe4c8bfb271262abbe52f1f1320d12174504202c7c4bc4bce0910511297" + + tf_http_archive( + name = name, + sha256 = LLVM_SHA256, +- strip_prefix = "llvm-project-{commit}".format(commit = LLVM_COMMIT), ++ strip_prefix = "llvm-llvm-project-{commit_partial}".format(commit_partial = LLVM_COMMIT[:7]), + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), +- "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), ++ "https://api.github.com/repos/llvm/llvm-project/tarball/{commit}".format(commit = LLVM_COMMIT), + ], + build_file = "//third_party/llvm:llvm.BUILD", + patch_file = [ diff --git a/build_deps/patches/tensorflow_serving.patch b/build_deps/patches/tensorflow_serving.patch new file mode 100644 index 00000000..9808ef8c --- /dev/null +++ b/build_deps/patches/tensorflow_serving.patch @@ -0,0 +1,25 @@ +diff --git a/tensorflow_serving/util/net_http/server/public/BUILD b/tensorflow_serving/util/net_http/server/public/BUILD +index 
e7f96d98..2ae0530a 100644 +--- tensorflow_serving/util/net_http/server/public/BUILD ++++ tensorflow_serving/util/net_http/server/public/BUILD +@@ -34,6 +34,7 @@ cc_library( + hdrs = [ + "httpserver.h", + ], ++ visibility = ["//visibility:public"], + deps = [ + ":http_server_api", + "//tensorflow_serving/util/net_http/server/internal:evhttp_server", +diff --git a/tensorflow_serving/workspace.bzl b/tensorflow_serving/workspace.bzl +index 08c3cc28..0803cdf3 100644 +--- tensorflow_serving/workspace.bzl ++++ tensorflow_serving/workspace.bzl +@@ -31,7 +31,7 @@ def tf_serving_workspace(): + url = "https://github.com/libevent/libevent/archive/release-2.1.8-stable.zip", + sha256 = "70158101eab7ed44fd9cc34e7f247b3cae91a8e4490745d9d6eb7edc184e4d96", + strip_prefix = "libevent-release-2.1.8-stable", +- build_file = "@//third_party/libevent:BUILD", ++ build_file = "@//third_party:event.BUILD.bzl", + ) + + # ===== ICU dependency ===== diff --git a/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch b/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch new file mode 100644 index 00000000..f24ed7dc --- /dev/null +++ b/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch @@ -0,0 +1,11 @@ +--- tensorflow/tensorflow.bzl ++++ tensorflow/tensorflow.bzl +@@ -1473,7 +1473,7 @@ + # Make a py_library out of the generated python file. + if not generated_target_name: + generated_target_name = name +- py_deps = [clean_dep("//tensorflow/python/framework:for_generated_wrappers_v2")] ++ py_deps = ["@pypi_tensorflow//:pkg"] + if extra_py_deps: + py_deps += extra_py_deps + py_lib_rule( diff --git a/build_deps/patches/tensorflow_zlib.patch b/build_deps/patches/tensorflow_zlib.patch new file mode 100644 index 00000000..e551f3d6 --- /dev/null +++ b/build_deps/patches/tensorflow_zlib.patch @@ -0,0 +1,11 @@ +--- third_party/zlib.BUILD ++++ third_party/zlib.BUILD +@@ -31,7 +31,7 @@ + "zutil.c", + "zutil.h", + ], +- hdrs = ["zlib.h"], ++ hdrs = ["zconf.h", "zlib.h"], + copts = select({ + "@org_tensorflow//tensorflow/tsl:windows": [], + "//conditions:default": [ diff --git a/build_deps/patches/tf2xla_visibility.patch b/build_deps/patches/tf2xla_visibility.patch new file mode 100644 index 00000000..84b7cdca --- /dev/null +++ b/build_deps/patches/tf2xla_visibility.patch @@ -0,0 +1,13 @@ +diff --git tensorflow/compiler/tf2xla/BUILD tensorflow/compiler/tf2xla/BUILD +index 22d9877bed9..3f6b421465d 100644 +--- tensorflow/compiler/tf2xla/BUILD ++++ tensorflow/compiler/tf2xla/BUILD +@@ -46,7 +46,7 @@ package_group( + packages = [ + "//platforms/performance/automl/...", + "//tensorflow/...", +- "//tensorflow_federated/cc/core/impl/executors/...", ++ "public", + "//tensorflow_models/...", + "//third_party/deepmind/deepmind_research/density_functional_approximation_dm21/...", + "//third_party/mlir_edge/model_curriculum/iree/...", diff --git a/build_deps/pip_tf/BUILD b/build_deps/pip_tf/BUILD new file mode 100644 index 00000000..56ebe258 --- /dev/null +++ b/build_deps/pip_tf/BUILD @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_python//python:defs.bzl", "py_test") +load(":defs.bzl", "PIP_TF_COPTS", "PIP_TF_CXXOPTS", "PIP_TF_LINKOPTS") + +py_test( + name = "pip_tf_flags_test", + srcs = ["pip_tf_flags_test.py"], + args = [ + "--copts=" + ",".join(PIP_TF_COPTS), + "--cxxopts=" + ",".join(PIP_TF_CXXOPTS), + "--linkopts=" + ",".join(PIP_TF_LINKOPTS), + ], + deps = [ + "@pypi_absl_py//:pkg", + "@pypi_tensorflow//:pkg", + ], +) diff --git a/build_deps/pip_tf/README.md b/build_deps/pip_tf/README.md new file mode 100644 index 00000000..79283160 --- /dev/null +++ b/build_deps/pip_tf/README.md @@ -0,0 +1,25 @@ + +When building libraries (such as custom op libraries) against the TensorFlow pip +package, care must be taken to ensure those libraries build against that +package's headers and with the same compiler and linker flags as that package +was compiled with. These utilities help ensure that's the case. + +This package assumes Tensorflow is available in the `@pypi_tensorflow` package, +with the additional build content specified in `TF_ADDITIVE_BUILD_CONTENT`: + +``` +load("@com_google_fcp//tensorflow/pip_tf:defs.bzl", "TF_ADDITIVE_BUILD_CONTENT") + +pip_parse( + name = "pypi", + annotations = { + "tensorflow": package_annotation( + additive_build_content = TF_ADDITIVE_BUILD_CONTENT, + ), + }, + ... +) +``` + +NOTE: The `gpu_srcs` and `gpu_deps` parameters supported by TensorFlow's version +of `tf_custom_op_library` are not supported by this version. diff --git a/build_deps/pip_tf/defs.bzl b/build_deps/pip_tf/defs.bzl new file mode 100644 index 00000000..c5395854 --- /dev/null +++ b/build_deps/pip_tf/defs.bzl @@ -0,0 +1,132 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides rules for building custom TensorFlow ops compatible with pip.""" + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda", "if_cuda_is_configured") + +# Build flags for using the pip-provided TensorFlow package. pip_tf_flags_test ensures that these +# values stay in sync with the currently-used TF version. +PIP_TF_COPTS = ["-DEIGEN_MAX_ALIGN_BYTES=64", "-D_GLIBCXX_USE_CXX11_ABI=1"] +PIP_TF_CXXOPTS = ["--std=c++17"] +PIP_TF_LINKOPTS = [] + +def _force_pip_tf_transition_impl(settings, _attr): + copts = list(settings["//command_line_option:copt"]) + cxxopts = list(settings["//command_line_option:cxxopt"]) + linkopts = list(settings["//command_line_option:linkopt"]) + copts += PIP_TF_COPTS + cxxopts += PIP_TF_CXXOPTS + linkopts += PIP_TF_LINKOPTS + + # TensorFlow's pip package was built with libstdc++. 
+ # TODO: Enable when clang build + # cxxopts.append("-stdlib=libstdc++") + # linkopts.append("-stdlib=libstdc++") + + return { + "//command_line_option:copt": copts, + "//command_line_option:cxxopt": cxxopts, + "//command_line_option:linkopt": linkopts, + } + +_force_pip_tf_transition = transition( + implementation = _force_pip_tf_transition_impl, + inputs = [ + "//command_line_option:copt", + "//command_line_option:cxxopt", + "//command_line_option:linkopt", + ], + outputs = [ + "//command_line_option:copt", + "//command_line_option:cxxopt", + "//command_line_option:linkopt", + ], +) + +def _force_pip_tf_impl(ctx): + cc_binary = ctx.attr.cc_binary[0] + output_file = ctx.actions.declare_file(ctx.label.name) + ctx.actions.symlink( + output = output_file, + target_file = cc_binary.files.to_list()[0], + ) + return DefaultInfo( + files = depset([output_file]), + data_runfiles = ctx.runfiles(transitive_files = depset([output_file])), + ) + +_force_pip_tf = rule( + doc = """Forces a shared library to be built in a way that's compatible +with the pip-provided Python TensorFlow package.""", + implementation = _force_pip_tf_impl, + attrs = { + "cc_binary": attr.label( + cfg = _force_pip_tf_transition, + mandatory = True, + doc = "The cc_binary target to build with TensorFlow compatibility.", + ), + "_allowlist_function_transition": attr.label( + default = "@bazel_tools//tools/allowlists/function_transition_allowlist", + ), + }, +) + +def tf_custom_op_library( + name, + srcs = [], + gpu_srcs = [], + deps = [], + gpu_deps = None, + tags = [], + visibility = None, + **kwargs): + """Replacement for TF's custom_op_library that targets pip-provided TF. + + This rule will force a transition to an environment that targets the + pip-provided TF library. This means that all deps of this target and the + target's own sources will be compiled with the necessary compiler flags to + correctly target a pip TF library. + """ + if not gpu_deps: + gpu_deps = [] + + if gpu_srcs: + basename = name.split(".")[0] + cuda_library( + name = basename + "_gpu", + srcs = gpu_srcs, + deps = deps + gpu_deps, + **kwargs + ) + deps = deps + [":" + basename + "_gpu"] + + native.cc_binary( + name = name + "_lib", + srcs = srcs, + linkshared = 1, + deps = deps + [ + "@pypi_tensorflow//:libtensorflow_framework", + "@pypi_tensorflow//:tf_headers", + ], + tags = tags + ["manual"], + visibility = ["//visibility:private"], + **kwargs + ) + + _force_pip_tf( + name = name, + cc_binary = name + "_lib", + visibility = visibility, + tags = tags, + ) diff --git a/build_deps/pip_tf/pip_tf_flags_test.py b/build_deps/pip_tf/pip_tf_flags_test.py new file mode 100644 index 00000000..dec43f4a --- /dev/null +++ b/build_deps/pip_tf/pip_tf_flags_test.py @@ -0,0 +1,65 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Verifies that build flags for custom TF ops are correct.""" + +import re + +from absl import flags +from absl.testing import absltest +import tensorflow as tf + +_COPTS = flags.DEFINE_list('copts', [], 'TF copts') +_CXXOPTS = flags.DEFINE_list('cxxopts', [], 'TF cxxopts') +_LINKOPTS = flags.DEFINE_list('linkopts', [], 'TF linkopts') + +_ERROR_MSG = """ +If the TensorFlow version has been updated, copy the new value to +fcp/tensorflow/pip_tf/defs.bzl. +""" + + +class PipTfFlagsTest(absltest.TestCase): + + def test_compile_flags(self): + copts = [] + cxxopts = [] + for flag in tf.sysconfig.get_compile_flags(): + # Ignore include flags, which are handled by bazel. + if flag.startswith('-I'): + continue + + if flag.startswith('--std=c++'): # Don't add C++-only flags to copts. + cxxopts.append(flag) + else: + copts.append(flag) + + self.assertSameElements(copts, _COPTS.value, _ERROR_MSG) + self.assertSameElements(cxxopts, _CXXOPTS.value, _ERROR_MSG) + + def test_link_flags(self): + linkopts = [] + for flag in tf.sysconfig.get_link_flags(): + # Ignore library search paths, which are handled by bazel. + if flag.startswith('-L'): + continue + # Ignore -ltensorflow_framework, which is handled by bazel. + if re.search(r'^-l(:lib)?tensorflow_framework', flag): + continue + linkopts.append(flag) + + self.assertSameElements(linkopts, _LINKOPTS.value, _ERROR_MSG) + + +if __name__ == '__main__': + absltest.main() diff --git a/build_deps/pip_tf/tensorflow.bzl b/build_deps/pip_tf/tensorflow.bzl new file mode 100644 index 00000000..5f164549 --- /dev/null +++ b/build_deps/pip_tf/tensorflow.bzl @@ -0,0 +1,3536 @@ +# +# Returns the options to use for a C++ library or binary build. +# Uses the ":optmode" config_setting to pick the options. +load( + "@org_tensorflow//tensorflow/core/platform:build_config_root.bzl", + "if_dynamic_kernels", + "if_static", + "tf_additional_grpc_deps_py", + "tf_additional_xla_deps_py", + "tf_exec_properties", + "tf_gpu_tests_tags", +) +load( + "@org_tensorflow//tensorflow/core/platform:rules_cc.bzl", + "cc_binary", + "cc_library", + "cc_shared_library", + "cc_test", +) +load( + "@org_tensorflow//tensorflow/tsl:tsl.bzl", + "tsl_gpu_library", + _cc_header_only_library = "cc_header_only_library", + _clean_dep = "clean_dep", + _if_cuda_or_rocm = "if_cuda_or_rocm", + _if_nccl = "if_nccl", + _transitive_hdrs = "transitive_hdrs", +) +load( + "@local_config_tensorrt//:build_defs.bzl", + "if_tensorrt", + "if_tensorrt_exec", +) +load( + "@local_config_cuda//cuda:build_defs.bzl", + "cuda_library", + "if_cuda", + "if_cuda_exec", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "rocm_copts", +) +load( + "@org_tensorflow//third_party/mkl:build_defs.bzl", + "if_enable_mkl", + "if_mkl", + "if_mkl_ml", +) +load( + "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", + "if_mkldnn_aarch64_acl", + "if_mkldnn_aarch64_acl_openmp", + "if_mkldnn_openmp", +) +load( + "@org_tensorflow//tensorflow/tsl/mkl:build_defs.bzl", + "onednn_v3_define", +) +load( + "@org_tensorflow//third_party/compute_library:build_defs.bzl", + "if_enable_acl", +) +load( + "@org_tensorflow//third_party/llvm_openmp:openmp.bzl", + "windows_llvm_openmp_linkopts", +) +load( + "@org_tensorflow//tensorflow:py.default.bzl", + _plain_py_binary = "py_binary", + _plain_py_library = "py_library", + _plain_py_test = "py_test", +) +load("@bazel_skylib//lib:new_sets.bzl", "sets") +load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo") + +def register_extension_info(**kwargs): + pass + +# version for the 
shared libraries, can +# not contain rc or alpha, only numbers. +# Also update tensorflow/core/public/version.h +# and tensorflow/tools/pip_package/setup.py +VERSION = "2.14.0" +VERSION_MAJOR = VERSION.split(".")[0] +two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] + +# The workspace root, to be used to set workspace 'include' paths in a way that +# will still work correctly when TensorFlow is included as a dependency of an +# external project. +workspace_root = Label("//:WORKSPACE").workspace_root or "." + +clean_dep = _clean_dep +cc_header_only_library = _cc_header_only_library +transitive_hdrs = _transitive_hdrs + +def if_oss(oss_value, google_value = []): + """Returns one of the arguments based on the non-configurable build env. + + Specifically, it does not return a `select`, and can be used to e.g. + compute elements of list attributes. + """ + return oss_value # copybara:comment_replace return google_value + +def if_google(google_value, oss_value = []): + """Returns one of the arguments based on the non-configurable build env. + + Specifically, it does not return a `select`, and can be used to e.g. + compute elements of list attributes. + """ + return oss_value # copybara:comment_replace return google_value + +def if_v2(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:api_version_2"): a, + "//conditions:default": [], + }) + +def if_not_v2(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:api_version_2"): [], + "//conditions:default": a, + }) + +def if_nvcc(a): + return select({ + "@local_config_cuda//cuda:using_nvcc": a, + "//conditions:default": [], + }) + +def if_xla_available(if_true, if_false = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:with_xla_support"): if_true, + "//conditions:default": if_false, + }) + +# Given a source file, generate a test name. +# i.e. "common_runtime/direct_session_test.cc" becomes +# "common_runtime_direct_session_test" +def src_to_test_name(src): + return src.replace("/", "_").replace(":", "_").split(".")[0] + +def full_path(relative_paths): + return [native.package_name() + "/" + relative for relative in relative_paths] + +def _add_tfcore_prefix(src): + if src.startswith("//"): + return src + return "@org_tensorflow//tensorflow/core:" + src + +def tf_android_core_proto_headers(core_proto_sources_relative): + """Returns the list of pb.h and proto.h headers that are generated for the provided sources.""" + return ([ + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h") + for p in core_proto_sources_relative + ] + [ + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h") + for p in core_proto_sources_relative + ]) + +def tf_portable_full_lite_protos(full, lite): + return select({ + "@org_tensorflow//tensorflow:mobile_lite_protos": lite, + "@org_tensorflow//tensorflow:mobile_full_protos": full, + # The default should probably be lite runtime, but since most clients + # seem to use the non-lite version, let's make that the default for now. 
+ "//conditions:default": full, + }) + +def if_no_default_logger(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:no_default_logger"): a, + "//conditions:default": [], + }) + +def if_android_x86(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_x86"): a, + clean_dep("@org_tensorflow//tensorflow:android_x86_64"): a, + "//conditions:default": [], + }) + +def if_android_arm(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_arm"): a, + "//conditions:default": [], + }) + +def if_android_arm64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_arm64"): a, + "//conditions:default": [], + }) + +def if_android_mips(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_mips"): a, + "//conditions:default": [], + }) + +def if_not_android(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): [], + "//conditions:default": a, + }) + +def if_not_android_mips_and_mips64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_mips"): [], + clean_dep("@org_tensorflow//tensorflow:android_mips64"): [], + "//conditions:default": a, + }) + +def if_android(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): a, + "//conditions:default": [], + }) + +def if_android_or_ios(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): a, + clean_dep("@org_tensorflow//tensorflow:ios"): a, + "//conditions:default": [], + }) + +def if_emscripten(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:emscripten"): a, + "//conditions:default": [], + }) + +def if_chromiumos(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:chromiumos"): a, + "//conditions:default": otherwise, + }) + +def if_macos(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): a, + "//conditions:default": otherwise, + }) + +def if_ios(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios"): a, + "//conditions:default": otherwise, + }) + +def if_ios_x86_64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios_x86_64"): a, + "//conditions:default": [], + }) + +def if_mobile(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:mobile"): a, + "//conditions:default": [], + }) + +def if_not_mobile(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:mobile"): [], + "//conditions:default": a, + }) + +# Config setting selector used when building for products +# which requires restricted licenses to be avoided. 
+def if_not_mobile_or_arm_or_lgpl_restricted(a): + _ = (a,) + return select({ + "//conditions:default": [], + }) + +def if_not_windows(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": a, + }) + +def if_windows(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:windows"): a, + "//conditions:default": otherwise, + }) + +def if_windows_cuda(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:is_cuda_enabled_and_windows"): a, + "//conditions:default": otherwise, + }) + +def if_not_fuchsia(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:fuchsia"): [], + "//conditions:default": a, + }) + +def if_linux_x86_64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): a, + "//conditions:default": [], + }) + +def if_override_eigen_strong_inline(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:override_eigen_strong_inline"): a, + "//conditions:default": [], + }) + +if_nccl = _if_nccl + +def if_zendnn(if_true, if_false = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): if_true, + "//conditions:default": if_false, + }) + +def if_libtpu(if_true, if_false = []): + """Shorthand for select()ing whether to build backend support for TPUs when building libtpu.so""" + return select({ + # copybara:uncomment_begin(different config setting in OSS) + # "//tools/cc_target_os:gce": if_true, + # copybara:uncomment_end_and_comment_begin + clean_dep("@org_tensorflow//tensorflow:with_tpu_support"): if_true, + # copybara:comment_end + "//conditions:default": if_false, + }) + +def if_with_tpu_support(if_true, if_false = []): + """Shorthand for select()ing whether to build API support for TPUs when building TensorFlow""" + return select({ + "@org_tensorflow//tensorflow:with_tpu_support": if_true, + "//conditions:default": if_false, + }) + +def if_registration_v2(if_true, if_false = []): + return select({ + "@org_tensorflow//tensorflow:registration_v2": if_true, + "//conditions:default": if_false, + }) + +def if_portable(if_true, if_false = []): + return if_true + +ADDITIONAL_API_INDEXABLE_SETTINGS = [] + +# We are never indexing generated code in the OSS build, but still +# return a select() for consistency. +def if_indexing_source_code( + if_true, # @unused + if_false): + """Return a select() on whether or not we are building for source code indexing.""" + return select({ + "//conditions:default": if_false, + }) + +# Linux systems may required -lrt linker flag for e.g. clock_gettime +# see https://github.com/tensorflow/tensorflow/issues/15129 +def lrt_if_needed(): + lrt = ["-lrt"] + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_aarch64"): lrt, + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): lrt, + clean_dep("@org_tensorflow//tensorflow:linux_ppc64le"): lrt, + "//conditions:default": [], + }) + +def get_win_copts(is_external = False): + WINDOWS_COPTS = [ + # copybara:uncomment_begin(no MSVC flags in google) + # "-DPLATFORM_WINDOWS", + # "-DEIGEN_HAS_C99_MATH", + # "-DTENSORFLOW_USE_EIGEN_THREADPOOL", + # "-DEIGEN_AVOID_STL_ARRAY", + # "-Iexternal/gemmlowp", + # "-Wno-sign-compare", + # "-DNOGDI", + # copybara:uncomment_end_and_comment_begin + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", # -Wno-sign-compare + # Bazel's CROSSTOOL currently pass /EHsc to enable exception by + # default. 
We can't pass /EHs-c- to disable exception, otherwise + # we will get a waterfall of flag conflict warnings. Wait for + # Bazel to fix this. + # "/D_HAS_EXCEPTIONS=0", + # "/EHs-c-", + "/wd4577", + "/DNOGDI", + # Also see build:windows lines in tensorflow/opensource_only/.bazelrc + # where we set some other options globally. + # copybara:comment_end + ] + + if is_external: + return WINDOWS_COPTS + [if_oss( + "/UTF_COMPILE_LIBRARY", + "-UTF_COMPILE_LIBRARY", + )] + else: + return WINDOWS_COPTS + [if_oss( + "/DTF_COMPILE_LIBRARY", + "-DTF_COMPILE_LIBRARY", + )] + +def tf_copts( + android_optimization_level_override = "-O2", + is_external = False, + allow_exceptions = False): + # For compatibility reasons, android_optimization_level_override + # is currently only being set for Android. + # To clear this value, and allow the CROSSTOOL default + # to be used, pass android_optimization_level_override=None + android_copts = [ + "-DTF_LEAN_BINARY", + "-Wno-narrowing", + ] + if android_optimization_level_override: + android_copts.append(android_optimization_level_override) + return ( + if_not_windows([ + "-DEIGEN_AVOID_STL_ARRAY", + "-Iexternal/gemmlowp", + "-Wno-sign-compare", + "-ftemplate-depth=900", + ]) + + (if_not_windows(["-fno-exceptions"]) if not allow_exceptions else []) + + if_cuda(["-DGOOGLE_CUDA=1"]) + + if_nvcc(["-DTENSORFLOW_USE_NVCC=1"]) + + if_libtpu(["-DLIBTPU_ON_GCE"], []) + + if_xla_available(["-DTENSORFLOW_USE_XLA=1"]) + + if_tensorrt(["-DGOOGLE_TENSORRT=1"]) + + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + + # Compile in oneDNN based ops when building for x86 platforms + if_mkl(["-DINTEL_MKL"]) + + # Enable additional ops (e.g., ops with non-NHWC data layout) and + # optimizations for Intel builds using oneDNN if configured + if_enable_mkl(["-DENABLE_MKL"]) + + if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) + + onednn_v3_define() + + if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) + + if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) + + if_zendnn(["-DAMD_ZENDNN"]) + + if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) + + if_android_arm(["-mfpu=neon", "-fomit-frame-pointer"]) + + if_linux_x86_64(["-msse3"]) + + if_ios_x86_64(["-msse4.1"]) + + if_no_default_logger(["-DNO_DEFAULT_LOGGER"]) + + select({ + clean_dep("@org_tensorflow//tensorflow:framework_shared_object"): [], + "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"], + }) + + select({ + clean_dep("@org_tensorflow//tensorflow:android"): android_copts, + clean_dep("@org_tensorflow//tensorflow:emscripten"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + clean_dep("@org_tensorflow//tensorflow:windows"): get_win_copts(is_external), + clean_dep("@org_tensorflow//tensorflow:ios"): [], + clean_dep("@org_tensorflow//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"], + "//conditions:default": ["-pthread"], + }) + ) + +def tf_copts_exec( + android_optimization_level_override = "-O2", + is_external = False, + allow_exceptions = False): + return tf_copts( + android_optimization_level_override, + is_external, + allow_exceptions, + ) + if_cuda_exec(["-DGOOGLE_CUDA=1"]) + if_tensorrt_exec(["-DGOOGLE_TENSORRT=1"]) + +def tf_openmp_copts(): + # We assume when compiling on Linux gcc/clang will be used and MSVC on Windows + return select({ + # copybara:uncomment_begin + # "//third_party/mkl:build_with_mkl_lnx_openmp": ["-fopenmp"], + # "//third_party/mkl:build_with_mkl_windows_openmp": ["/openmp"], + # copybara:uncomment_end_and_comment_begin + 
"@org_tensorflow//third_party/mkl:build_with_mkl_lnx_openmp": ["-fopenmp"], + "@org_tensorflow//third_party/mkl:build_with_mkl_windows_openmp": ["/openmp:llvm"], + # copybara:comment_end + "//conditions:default": [], + }) + +def tf_openmp_lopts(): + # When compiling on Windows, force MSVC to use libiomp that was compiled + # as part of this build. + return select({ + "//third_party/mkl:build_with_mkl_windows_openmp": [windows_llvm_openmp_linkopts()], + "//conditions:default": [], + }) + +def tf_opts_nortti(): + return [ + "-fno-rtti", + "-DGOOGLE_PROTOBUF_NO_RTTI", + "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER", + ] + +def tf_opts_force_rtti(): + return select({ + clean_dep("@org_tensorflow//tensorflow:force_rtti"): ["-frtti"], + "//conditions:default": [], + }) + +def tf_opts_nortti_if_android(): + return if_android(tf_opts_nortti()) + tf_opts_force_rtti() + +def tf_opts_nortti_if_mobile(): + return if_mobile(tf_opts_nortti()) + tf_opts_force_rtti() + +def tf_defines_nortti(): + return [ + "GOOGLE_PROTOBUF_NO_RTTI", + "GOOGLE_PROTOBUF_NO_STATIC_INITIALIZER", + ] + +def tf_defines_nortti_if_android(): + return if_android(tf_defines_nortti()) + +def tf_features_nomodules_if_android(): + return if_android(["-use_header_modules"]) + +def tf_features_nomodules_if_mobile(): + return if_mobile(["-use_header_modules"]) + +# portable_tensorflow_lib_lite does not export the headers needed to +# use it. Thus anything that depends on it needs to disable layering +# check. +def tf_features_nolayering_check_if_ios(): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios"): ["-layering_check"], + "//conditions:default": [], + }) + +def tf_opts_nortti_if_lite_protos(): + return tf_portable_full_lite_protos( + full = [], + lite = tf_opts_nortti(), + ) + tf_opts_force_rtti() + +def tf_defines_nortti_if_lite_protos(): + return tf_portable_full_lite_protos( + full = [], + lite = tf_defines_nortti(), + ) + +# Given a list of "op_lib_names" (a list of files in the ops directory +# without their .cc extensions), generate a library for that file. +def tf_gen_op_libs( + op_lib_names, + sub_directory = "ops/", + deps = None, + is_external = True, + compatible_with = None, + features = []): + # Make library out of each op so it can also be used to generate wrappers + # for various languages. + if not deps: + deps = [] + for n in op_lib_names: + cc_library( + name = n + "_op_lib", + copts = tf_copts_exec(is_external = is_external), + features = features, + srcs = [sub_directory + n + ".cc"], + deps = deps + [clean_dep("@org_tensorflow//tensorflow/core:framework")], + compatible_with = compatible_with, + visibility = ["//visibility:public"], + alwayslink = 1, + linkstatic = 1, + ) + +def _make_search_paths(prefix, levels_to_root): + return ",".join( + [ + "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) + for search_level in range(levels_to_root + 1) + ], + ) + +def _rpath_linkopts(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. 
+ levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), + ], + }) + +def _rpath_user_link_flags(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), + ], + }) + +# Bazel-generated shared objects which must be linked into TensorFlow binaries +# to define symbols from //tensorflow/core:framework and //tensorflow/core:lib. +def tf_binary_additional_srcs(fullversion = False): + if fullversion: + suffix = "." + VERSION + else: + suffix = "." + VERSION_MAJOR + + return if_static( + extra_deps = [], + macos = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework%s.dylib" % suffix), + ], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so%s" % suffix), + ], + ) + +def tf_binary_additional_data_deps(): + return if_static( + extra_deps = [], + macos = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.dylib"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION), + ], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION), + ], + ) + +def tf_binary_pybind_deps(): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_macos", + ), + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_windows", + ), + ], + "//conditions:default": [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_linux", + ), + ], + }) + +# Helper function for the per-OS tensorflow libraries and their version symlinks +def tf_shared_library_deps(): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos_with_framework_shared_object"): [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow.dylib"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.%s.dylib" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.%s.dylib" % VERSION), + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + 
clean_dep("@org_tensorflow//tensorflow:windows"): [ + clean_dep("@org_tensorflow//tensorflow:tensorflow.dll"), + clean_dep("@org_tensorflow//tensorflow:tensorflow_dll_import_lib"), + ], + clean_dep("@org_tensorflow//tensorflow:framework_shared_object"): [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so.%s" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so.%s" % VERSION), + ], + "//conditions:default": [], + }) + tf_binary_additional_srcs() + +# Helper functions to add kernel dependencies to tf binaries when using dynamic +# kernel linking. +def tf_binary_dynamic_kernel_dsos(): + return if_dynamic_kernels( + extra_deps = [ + # TODO(gunan): Remove dependencies on these, and make them load dynamically. + # "@org_tensorflow//tensorflow/core/kernels:libtfkernel_all_kernels.so", + ], + otherwise = [], + ) + +# Helper functions to add kernel dependencies to tf binaries when using static +# kernel linking. +def tf_binary_dynamic_kernel_deps(kernels): + return if_dynamic_kernels( + extra_deps = [], + otherwise = kernels, + ) + +# Shared libraries have different name pattern on different platforms, +# but cc_binary cannot output correct artifact name yet, +# so we generate multiple cc_binary targets with all name patterns when necessary. +# TODO(pcloudy): Remove this workaround when https://github.com/bazelbuild/bazel/issues/4570 +# is done and cc_shared_library is available. +SHARED_LIBRARY_NAME_PATTERN_LINUX = "lib%s.so%s" +SHARED_LIBRARY_NAME_PATTERN_MACOS = "lib%s%s.dylib" +SHARED_LIBRARY_NAME_PATTERN_WINDOWS = "%s%s.dll" +SHARED_LIBRARY_NAME_PATTERNS = [ + SHARED_LIBRARY_NAME_PATTERN_LINUX, + SHARED_LIBRARY_NAME_PATTERN_MACOS, + SHARED_LIBRARY_NAME_PATTERN_WINDOWS, +] + +def tf_cc_shared_object( + name, + srcs = [], + deps = [], + data = [], + linkopts = lrt_if_needed(), + framework_so = tf_binary_additional_srcs(), + soversion = None, + kernels = [], + per_os_targets = False, # Generate targets with SHARED_LIBRARY_NAME_PATTERNS + visibility = None, + **kwargs): + """Configure the shared object (.so) file for TensorFlow.""" + if soversion != None: + suffix = "." + str(soversion).split(".")[0] + longsuffix = "." 
+ str(soversion) + else: + suffix = "" + longsuffix = "" + + if per_os_targets: + names = [ + ( + pattern % (name, ""), + pattern % (name, suffix), + pattern % (name, longsuffix), + ) + for pattern in SHARED_LIBRARY_NAME_PATTERNS + ] + else: + names = [( + name, + name + suffix, + name + longsuffix, + )] + + testonly = kwargs.pop("testonly", False) + + for name_os, name_os_major, name_os_full in names: + # Windows DLLs cant be versioned + if name_os.endswith(".dll"): + name_os_major = name_os + name_os_full = name_os + + if name_os != name_os_major: + native.genrule( + name = name_os + "_sym", + outs = [name_os], + srcs = [name_os_major], + output_to_bindir = 1, + cmd = "ln -sf $$(basename $<) $@", + ) + native.genrule( + name = name_os_major + "_sym", + outs = [name_os_major], + srcs = [name_os_full], + output_to_bindir = 1, + cmd = "ln -sf $$(basename $<) $@", + ) + + soname = name_os_major.split("/")[-1] + + data_extra = [] + if framework_so != []: + data_extra = tf_binary_additional_data_deps() + + cc_binary( + exec_properties = if_google({"cpp_link.mem": "16g"}, {}), + name = name_os_full, + srcs = srcs + framework_so, + deps = deps, + linkshared = 1, + data = data + data_extra, + linkopts = linkopts + _rpath_linkopts(name_os_full) + select({ + clean_dep("@org_tensorflow//tensorflow:ios"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,-soname," + soname, + ], + }), + testonly = testonly, + visibility = visibility, + **kwargs + ) + + flat_names = [item for sublist in names for item in sublist] + if name not in flat_names: + native.filegroup( + name = name, + srcs = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [":%s.dll" % (name)], + clean_dep("@org_tensorflow//tensorflow:macos"): [":lib%s%s.dylib" % (name, longsuffix)], + "//conditions:default": [":lib%s.so%s" % (name, longsuffix)], + }), + visibility = visibility, + testonly = testonly, + ) + +# buildozer: disable=function-docstring-args +def tf_cc_shared_library_opensource( + name, + srcs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + roots = [], + exports_filter = [], + data = [], + copts = [], + linkopts = lrt_if_needed(), + additional_linker_inputs = [], + linkstatic = True, + framework_so = [clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework_import_lib")], + soversion = None, + per_os_targets = False, # TODO(rostam): Should be deprecated. + win_def_file = None, + visibility = None): + """Configures the shared object file for TensorFlow.""" + names = _get_shared_library_name_os_version_matrix( + name, + per_os_targets = per_os_targets, + version = soversion, + ) + for name_os, name_os_major, name_os_full in names: + soname = name_os_major.split("/")[-1] # Uses major version for soname. 
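+        # Illustrative example (not from upstream TensorFlow): with
+        # per_os_targets = True, name = "tensorflow_cc" and soversion = "2.14.0",
+        # the Linux entry yields the soname "libtensorflow_cc.so.2".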
+ user_link_flags = linkopts + _rpath_user_link_flags(name_os_full) + select({ + clean_dep("@org_tensorflow//tensorflow:ios"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,-soname," + soname, + ], + }) + _tf_cc_shared_library_opensource( + name_os_full, + additional_linker_inputs = additional_linker_inputs, + copts = copts, + data = data, + deps = deps + framework_so, + dynamic_deps = dynamic_deps, + exports_filter = exports_filter, + linkstatic = linkstatic, + roots = roots, + shared_lib_name = name_os_full, + srcs = srcs, + static_deps = static_deps, + user_link_flags = user_link_flags, + visibility = visibility, + win_def_file = win_def_file, + ) + + if name_os != name_os_major: + filegroup_name = name_os_full + "_filegroup" + filegroup( + name = filegroup_name, + srcs = [name_os_full], + output_group = "main_shared_library_output", + visibility = visibility, + ) + _create_symlink(name_os, name_os_major, visibility = visibility) + _create_symlink(name_os_major, filegroup_name, visibility = visibility) + + if name not in [item for sublist in names for item in sublist]: + native.filegroup( + name = name, + srcs = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [":%s" % get_versioned_shared_library_name_windows(name, soversion)], + clean_dep("@org_tensorflow//tensorflow:macos"): [":%s" % get_versioned_shared_library_name_macos(name, soversion)], + "//conditions:default": [":%s" % get_versioned_shared_library_name_linux(name, soversion)], + }), + visibility = visibility, + ) + +def _tf_cc_shared_library_opensource( + name, + additional_linker_inputs = None, + copts = None, + data = None, + deps = None, + dynamic_deps = None, + exports_filter = None, + linkstatic = False, + roots = None, + shared_lib_name = None, + srcs = None, + static_deps = None, + user_link_flags = None, + visibility = None, + win_def_file = None): + cc_library_name = name + "_cclib" + cc_library( + name = cc_library_name, + srcs = srcs, + data = data, + deps = deps, + copts = copts, + linkstatic = linkstatic, + ) + cc_shared_library( + name = name, + roots = [cc_library_name] + roots, + exports_filter = exports_filter, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + shared_lib_name = shared_lib_name, + user_link_flags = user_link_flags, + additional_linker_inputs = additional_linker_inputs, + visibility = visibility, + win_def_file = if_windows(win_def_file, otherwise = None), + ) + +def _create_symlink(src, dest, visibility = None): + native.genrule( + name = src + "_sym", + outs = [src], + srcs = [dest], + output_to_bindir = 1, + cmd = "ln -sf $$(realpath --relative-to=$(RULEDIR) $<) $@", + visibility = visibility, + ) + +def _get_shared_library_name_os_version_matrix(name, per_os_targets = False, version = None): + if per_os_targets: + names = [ + (get_versioned_shared_library_name_linux(name), get_versioned_shared_library_name_linux(name, version, True), get_versioned_shared_library_name_linux(name, version)), + (get_versioned_shared_library_name_macos(name), get_versioned_shared_library_name_macos(name, version, True), get_versioned_shared_library_name_macos(name, version)), + (get_versioned_shared_library_name_windows(name), get_versioned_shared_library_name_windows(name, version, True), get_versioned_shared_library_name_windows(name, version)), + ] + else: + names = [(name, name + 
get_suffix_major_version(version), name + get_suffix_version(version))] + return names + +def get_versioned_shared_library_name_linux(name, version = None, major = False): + if major: + name = SHARED_LIBRARY_NAME_PATTERN_LINUX % (name, get_suffix_major_version(version)) + else: + name = SHARED_LIBRARY_NAME_PATTERN_LINUX % (name, get_suffix_version(version)) + return name + +def get_versioned_shared_library_name_macos(name, version = None, major = False): + if major: + name = SHARED_LIBRARY_NAME_PATTERN_MACOS % (name, get_suffix_major_version(version)) + else: + name = SHARED_LIBRARY_NAME_PATTERN_MACOS % (name, get_suffix_version(version)) + return name + +def get_versioned_shared_library_name_windows(name, version = None, major = False): + _ = version # buildifier: disable=unused-variable + _ = major # buildifier: disable=unused-variable + return SHARED_LIBRARY_NAME_PATTERN_WINDOWS % (name, "") + +def get_suffix_version(version): + return "." + str(version) if version else "" + +def get_suffix_major_version(version): + return "." + extract_major_version(version) if version else "" + +def extract_major_version(version): + return str(version).split(".", 1)[0] + +# Export open source version of tf_cc_shared_library under base name as well. +tf_cc_shared_library = tf_cc_shared_library_opensource + +# Links in the framework shared object +# (//third_party/tensorflow:libtensorflow_framework.so) when not building +# statically. Also adds linker options (rpaths) so that the framework shared +# object can be found. +def tf_cc_binary( + name, + srcs = [], + deps = [], + data = [], + linkopts = lrt_if_needed(), + copts = tf_copts(), + kernels = [], + per_os_targets = False, # Generate targets with SHARED_LIBRARY_NAME_PATTERNS + visibility = None, + default_copts = [], + **kwargs): + if kernels: + added_data_deps = tf_binary_dynamic_kernel_dsos() + else: + added_data_deps = [] + + if per_os_targets: + names = [pattern % (name, "") for pattern in SHARED_LIBRARY_NAME_PATTERNS] + else: + names = [name] + + # Optional MKL dependency, we also tell buildcleaner to ignore this dep using a tag. + mkl_dep = if_mkl_ml([clean_dep("//third_party/mkl:intel_binary_blob")]) + tags = kwargs.pop("tags", []) + ["req_dep=" + clean_dep("//third_party/mkl:intel_binary_blob")] + + for name_os in names: + cc_binary( + name = name_os, + copts = default_copts + copts, + srcs = srcs + tf_binary_additional_srcs(), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_dep + if_static( + extra_deps = [], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework_import_lib"), + ], + ), + tags = tags, + data = depset(data + added_data_deps), + linkopts = linkopts + _rpath_linkopts(name_os), + visibility = visibility, + **kwargs + ) + if name not in names: + native.filegroup( + name = name, + srcs = select({ + "@org_tensorflow//tensorflow:windows": [":%s.dll" % name], + "@org_tensorflow//tensorflow:macos": [":lib%s.dylib" % name], + "//conditions:default": [":lib%s.so" % name], + }), + visibility = visibility, + ) + +register_extension_info( + extension = tf_cc_binary, + label_regex_for_dep = "{extension_name}", +) + +# A simple wrap around native.cc_binary rule. +# When using this rule, you should realize it doesn't link to any tensorflow +# dependencies by default. 
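+# Illustrative usage sketch (target and file names below are hypothetical,
+# not part of upstream TensorFlow):
+#   tf_native_cc_binary(
+#       name = "my_tool",
+#       srcs = ["my_tool.cc"],
+#   )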
+def tf_native_cc_binary( + name, + copts = tf_copts(), + linkopts = [], + **kwargs): + cc_binary( + name = name, + copts = copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name) + lrt_if_needed(), + **kwargs + ) + +def tf_gen_op_wrapper_cc( + name, + out_ops_file, + pkg = "", + op_gen = clean_dep("@org_tensorflow//tensorflow/cc:cc_op_gen_main"), + deps = None, + include_internal_ops = 0, + # ApiDefs will be loaded in the order specified in this list. + api_def_srcs = [], + compatible_with = []): + # Construct an op generator binary for these ops. + tool = out_ops_file + "_gen_cc" + if deps == None: + deps = [pkg + ":" + name + "_op_lib"] + tf_cc_binary( + name = tool, + copts = tf_copts(), + linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + lrt_if_needed(), + linkstatic = 1, # Faster to link this one-time-use binary dynamically + deps = [op_gen] + deps, + ) + + srcs = api_def_srcs[:] + + if not api_def_srcs: + api_def_args_str = "," + else: + api_def_args = [] + for api_def_src in api_def_srcs: + # Add directory of the first ApiDef source to args. + # We are assuming all ApiDefs in a single api_def_src are in the + # same directory. + api_def_args.append( + " $$(dirname $$(echo $(locations " + api_def_src + + ") | cut -d\" \" -f1))", + ) + api_def_args_str = ",".join(api_def_args) + + native.genrule( + name = name + "_genrule", + outs = [ + out_ops_file + ".h", + out_ops_file + ".cc", + out_ops_file + "_internal.h", + out_ops_file + "_internal.cc", + ], + srcs = srcs, + tools = [":" + tool] + tf_binary_additional_srcs(), + cmd = ("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " + + "$(location :" + out_ops_file + ".cc) " + + str(include_internal_ops) + " " + api_def_args_str), + compatible_with = compatible_with, + ) + +# Given a list of "op_lib_names" (a list of files in the ops directory +# without their .cc extensions), generate individual C++ .cc and .h +# files for each of the ops files mentioned, and then generate a +# single cc_library called "name" that combines all the +# generated C++ code. +# +# For example, for: +# tf_gen_op_wrappers_cc("tf_ops_lib", [ "array_ops", "math_ops" ]) +# +# +# This will ultimately generate ops/* files and a library like: +# +# cc_library(name = "tf_ops_lib", +# srcs = [ "ops/array_ops.cc", +# "ops/math_ops.cc" ], +# hdrs = [ "ops/array_ops.h", +# "ops/math_ops.h" ], +# deps = [ ... ]) +# +# Plus a private library for the "hidden" ops. +# cc_library(name = "tf_ops_lib_internal", +# srcs = [ "ops/array_ops_internal.cc", +# "ops/math_ops_internal.cc" ], +# hdrs = [ "ops/array_ops_internal.h", +# "ops/math_ops_internal.h" ], +# deps = [ ... ]) +# TODO(joshl): Cleaner approach for hidden ops. +def tf_gen_op_wrappers_cc( + name, + op_lib_names = [], + other_srcs = [], + other_hdrs = [], + other_srcs_internal = [], + other_hdrs_internal = [], + pkg = "", + deps = [ + clean_dep("@org_tensorflow//tensorflow/cc:ops"), + clean_dep("@org_tensorflow//tensorflow/cc:scope"), + clean_dep("@org_tensorflow//tensorflow/cc:const_op"), + ], + deps_internal = [], + op_gen = clean_dep("@org_tensorflow//tensorflow/cc:cc_op_gen_main"), + include_internal_ops = 0, + visibility = None, + # ApiDefs will be loaded in the order specified in this list. + api_def_srcs = [], + # Any extra dependencies that the wrapper generator might need. 
+ extra_gen_deps = [], + compatible_with = []): + subsrcs = other_srcs[:] + subhdrs = other_hdrs[:] + internalsrcs = other_srcs_internal[:] + internalhdrs = other_hdrs_internal[:] + for n in op_lib_names: + tf_gen_op_wrapper_cc( + n, + "ops/" + n, + api_def_srcs = api_def_srcs, + include_internal_ops = include_internal_ops, + op_gen = op_gen, + pkg = pkg, + deps = [pkg + ":" + n + "_op_lib"] + extra_gen_deps, + compatible_with = compatible_with, + ) + subsrcs += ["ops/" + n + ".cc"] + subhdrs += ["ops/" + n + ".h"] + internalsrcs += ["ops/" + n + "_internal.cc"] + internalhdrs += ["ops/" + n + "_internal.h"] + + cc_library( + name = name, + srcs = subsrcs, + hdrs = subhdrs, + deps = deps + if_not_android([ + clean_dep("@org_tensorflow//tensorflow/core:core_cpu"), + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + clean_dep("@org_tensorflow//tensorflow/core:ops"), + clean_dep("@org_tensorflow//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("@org_tensorflow//tensorflow/core:portable_tensorflow_lib"), + ]), + copts = tf_copts(), + alwayslink = 1, + visibility = visibility, + compatible_with = compatible_with, + ) + cc_library( + name = name + "_internal", + srcs = internalsrcs, + hdrs = internalhdrs, + deps = deps + deps_internal + if_not_android([ + clean_dep("@org_tensorflow//tensorflow/core:core_cpu"), + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + clean_dep("@org_tensorflow//tensorflow/core:ops"), + clean_dep("@org_tensorflow//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("@org_tensorflow//tensorflow/core:portable_tensorflow_lib"), + ]), + copts = tf_copts(), + alwayslink = 1, + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")], + compatible_with = compatible_with, + ) + +OpRegistrationSrcInfo = provider( + "Info needed to extract op registration sources.", + fields = { + "srcs": "depset of source Files that contains op registrations.", + }, +) + +def _collect_op_reg_srcs_aspect_impl(_target, ctx): + """Aspect implementation function for collect_op_reg_srcs_aspect. + + This aspect will traverse the dependency graph along the "deps" attribute of the target + and return an OpRegistrationSrcInfo provider. + + OpRegistrationSrcInfo will have the union of the srcs of the C++ dependencies + with filename end with "_ops.cc" or "_op.cc". 
+ """ + direct, transitive = [], [] + if ctx.rule.kind == "cc_library" and hasattr(ctx.rule.attr, "srcs"): + # Assuming the filename of op registration source files ends with "_ops.cc" or "_op.cc" + direct += [ + src + for src in ctx.rule.files.srcs + if src.path.endswith("_op.cc") or src.path.endswith("_ops.cc") + ] + if hasattr(ctx.rule.attr, "deps"): + for dep in ctx.rule.attr.deps: + if OpRegistrationSrcInfo in dep: + transitive.append(dep[OpRegistrationSrcInfo].srcs) + if not direct and not transitive: + return [] + return [OpRegistrationSrcInfo(srcs = depset(direct = direct, transitive = transitive))] + +collect_op_reg_srcs_aspect = aspect( + attr_aspects = ["deps"], + required_providers = [CcInfo], + implementation = _collect_op_reg_srcs_aspect_impl, +) + +def _generate_op_reg_offsets_impl(ctx): + op_reg_srcs = [] + for dep in ctx.attr.deps: + if OpRegistrationSrcInfo in dep: + for src in dep[OpRegistrationSrcInfo].srcs.to_list(): + op_reg_srcs.append(src) + + args = ctx.actions.args() + args.add(ctx.outputs.out.path, format = "--out_path=%s") + args.add_all(op_reg_srcs) + + ctx.actions.run( + outputs = [ctx.outputs.out], + inputs = op_reg_srcs + ctx.files.tf_binary_additional_srcs, + tools = [ctx.executable._offset_counter], + executable = ctx.executable._offset_counter, + arguments = [args], + ) + +generate_op_reg_offsets = rule( + attrs = { + "out": attr.output(), + "deps": attr.label_list( + aspects = [collect_op_reg_srcs_aspect], + mandatory = True, + allow_files = True, + providers = [CcInfo], + ), + # This is for carrying the required files for _offset_counter to execute. + "tf_binary_additional_srcs": attr.label_list( + cfg = "exec", + mandatory = True, + allow_files = True, + ), + "_offset_counter": attr.label( + cfg = "exec", + executable = True, + allow_files = True, + default = "@org_tensorflow//tensorflow/python/framework:offset_counter", + ), + }, + implementation = _generate_op_reg_offsets_impl, +) + +def tf_gen_op_wrapper_py( + name, + out = None, + hidden = None, + visibility = None, + deps = [], + require_shape_functions = False, + hidden_file = None, + generated_target_name = None, + op_whitelist = [], + op_allowlist = [], + cc_linkopts = lrt_if_needed(), + api_def_srcs = [], + compatible_with = [], + testonly = False, + copts = [], + extra_py_deps = None, + py_lib_rule = _plain_py_library): + """Generates a Python library target wrapping the ops registered in "deps". + + Args: + name: used as the name of the generated target and as a name component of + the intermediate files. + out: name of the python file created by this rule. If None, then + "ops/gen_{name}.py" is used. + hidden: Optional list of ops names to make private in the Python module. + It is invalid to specify both "hidden" and "op_allowlist". + visibility: passed to py_library. + deps: list of dependencies for the intermediate tool used to generate the + python target. NOTE these `deps` are not applied to the final python + library target itself. + require_shape_functions: Unused. Leave this as False. + hidden_file: optional file that contains a list of op names to make private + in the generated Python module. Each op name should be on a line by + itself. Lines that start with characters that are invalid op name + starting characters are treated as comments and ignored. + generated_target_name: name of the generated target (overrides the + "name" arg) + op_whitelist: [DEPRECATED] an older spelling for "op_allowlist" + op_allowlist: if not empty, only op names in this list will be wrapped. 
It + is invalid to specify both "hidden" and "op_allowlist". + cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the + specified ops. + api_def_srcs: undocumented. + compatible_with: undocumented. + testonly: undocumented. + copts: undocumented. + extra_py_deps: undocumented. + py_lib_rule: undocumented. + """ + _ = require_shape_functions # Unused. + if op_whitelist and op_allowlist: + fail("op_whitelist is deprecated. Only use op_allowlist.") + if op_whitelist: + full_target_name = "//" + native.package_name() + ":" + name + print("op_whitelist is deprecated. Please migrate to the preferred " + + "`op_allowlist` spelling. Offending target: " + + full_target_name) # buildifier: disable=print + op_allowlist = op_whitelist + + if (hidden or hidden_file) and op_allowlist: + fail("Cannot pass specify both hidden and op_allowlist.") + + # Construct a cc_binary containing the specified ops. + tool_name = "gen_" + name + "_py_wrappers_cc" + if not deps: + deps = [str(Label("@org_tensorflow//tensorflow/core:" + name + "_op_lib"))] + tf_cc_binary( + name = tool_name, + copts = copts + tf_copts_exec(), + linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + cc_linkopts, + linkstatic = 1, # Faster to link this one-time-use binary dynamically + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")], + deps = ([ + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/python/framework:python_op_gen_main"), + ] + deps), + testonly = testonly, + ) + + pygen_args = [] + + # Invoke the previous cc_binary to generate a python file. + if not out: + out = "ops/gen_" + name + ".py" + + extra_srcs = [] + if hidden: + pygen_args.append("--hidden_op_list=" + ",".join(hidden)) + elif hidden_file: + # `hidden_file` is file containing a list of op names to be hidden in the + # generated module. + pygen_args.append("--hidden_op_list_filename=$(location " + hidden_file + ")") + extra_srcs.append(hidden_file) + elif op_allowlist: + pygen_args.append("--op_allowlist=" + ",".join(["'%s'" % op for op in op_allowlist])) + + # Prepare ApiDef directories to pass to the genrule. + if api_def_srcs: + api_def_args = [] + for api_def_src in api_def_srcs: + # Add directory of the first ApiDef source to args. + # We are assuming all ApiDefs in a single api_def_src are in the + # same directory. + api_def_args.append( + "$$(dirname $$(echo $(locations " + api_def_src + + ") | cut -d\" \" -f1))", + ) + pygen_args.append("--api_def_dirs=" + ",".join(api_def_args)) + + op_reg_offset_out = "gen_" + name + "_reg_offsets.pb" + generate_op_reg_offsets( + name = name + "_reg_offsets", + out = op_reg_offset_out, + # Feed an empty dep list if not indexing to skip unnecessary aspect propagation. + deps = select({ + clean_dep("@org_tensorflow//tensorflow:api_indexable"): deps, + "//conditions:default": [], + }), + tf_binary_additional_srcs = tf_binary_additional_srcs(), + testonly = testonly, + ) + extra_srcs.append(op_reg_offset_out) + pygen_args.append("--op_reg_offset_filename=$(location " + op_reg_offset_out + ")") + + native.genrule( + name = name + "_pygenrule", + outs = [out], + srcs = api_def_srcs + extra_srcs, + tools = [tool_name] + tf_binary_additional_srcs(), + cmd = ("$(location " + tool_name + ") " + " ".join(pygen_args) + " > $@"), + compatible_with = compatible_with, + testonly = testonly, + ) + + # Make a py_library out of the generated python file. 
+ if not generated_target_name: + generated_target_name = name + py_deps = [clean_dep("@org_tensorflow//tensorflow/python/framework:for_generated_wrappers_v2")] + if extra_py_deps: + py_deps += extra_py_deps + py_lib_rule( + name = generated_target_name, + srcs = [out], + srcs_version = "PY3", + visibility = visibility, + deps = py_deps, + # Instruct build_cleaner to try to avoid using this rule; typically ops + # creators will provide their own tf_custom_op_py_library based target + # that wraps this one. + tags = ["avoid_dep"], + compatible_with = compatible_with, + testonly = testonly, + ) + +# Define a bazel macro that creates cc_test for tensorflow. +# +# Links in the framework shared object +# (//third_party/tensorflow:libtensorflow_framework.so) when not building +# statically. Also adds linker options (rpaths) so that the framework shared +# object can be found. +# +# TODO(opensource): we need to enable this to work around the hidden symbol +# __cudaRegisterFatBinary error. Need more investigations. +def tf_cc_test( + name, + srcs, + deps, + data = [], + extra_copts = [], + suffix = "", + linkopts = lrt_if_needed(), + kernels = [], + **kwargs): + cc_test( + name = "%s%s" % (name, suffix), + srcs = srcs + tf_binary_additional_srcs(), + copts = tf_copts() + extra_copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + clean_dep("//third_party/compute_library:build_with_acl"): [ + "-fopenmp", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( + [ + clean_dep("//third_party/mkl:intel_binary_blob"), + ], + ), + data = data + + tf_binary_dynamic_kernel_dsos() + + tf_binary_additional_srcs(), + exec_properties = tf_exec_properties(kwargs), + **kwargs + ) + +def tf_cc_shared_test( + name, + srcs, + deps, + data = [], + extra_copts = [], + suffix = "", + linkopts = lrt_if_needed(), + kernels = [], + **kwargs): + cc_test( + name = "%s%s" % (name, suffix), + srcs = srcs, + copts = tf_copts() + extra_copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + clean_dep("//third_party/compute_library:build_with_acl"): [ + "-fopenmp", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( + [ + clean_dep("//third_party/mkl:intel_binary_blob"), + ], + ), + dynamic_deps = if_static( + extra_deps = [], + macos = ["@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION], + otherwise = ["@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION], + ), + data = data + tf_binary_dynamic_kernel_dsos(), + exec_properties = tf_exec_properties(kwargs), + **kwargs + ) + +register_extension_info( + extension = tf_cc_test, + label_regex_for_dep = "{extension_name}", +) + +# TODO(jakeharmon): Replace with an implementation which doesn't add a +# dependency on core:common_runtime +def tf_gpu_cc_test( + name, + srcs = [], + deps = [], + tags = [], + data = [], + size = "medium", + extra_copts = [], + linkstatic = 0, + args = [], + kernels = [], + linkopts = [], + **kwargs): + targets = [] + tf_cc_test( + name = 
name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]), + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + suffix = "_cpu", + tags = tags, + deps = deps, + **kwargs + ) + targets.append(name + "_cpu") + tf_cc_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]), + kernels = kernels, + linkopts = linkopts, + linkstatic = select({ + # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out. + clean_dep("@org_tensorflow//tensorflow:macos"): 1, + "@local_config_cuda//cuda:using_nvcc": 1, + "@local_config_cuda//cuda:using_clang": 1, + "//conditions:default": 0, + }), + suffix = "_gpu", + tags = tags + tf_gpu_tests_tags(), + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_runtime"), + ]), + **kwargs + ) + targets.append(name + "_gpu") + if "multi_gpu" in tags or "multi_and_single_gpu" in tags: + cleaned_tags = tags + two_gpu_tags + if "requires-gpu-nvidia" in cleaned_tags: + cleaned_tags.remove("requires-gpu-nvidia") + tf_cc_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts, + kernels = kernels, + linkopts = linkopts, + linkstatic = select({ + # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out. + clean_dep("@org_tensorflow//tensorflow:macos"): 1, + "@local_config_cuda//cuda:using_nvcc": 1, + "@local_config_cuda//cuda:using_clang": 1, + "//conditions:default": 0, + }), + suffix = "_2gpu", + tags = cleaned_tags, + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_runtime"), + ]), + **kwargs + ) + targets.append(name + "_2gpu") + + native.test_suite(name = name, tests = targets, tags = tags) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_cc_test(*args, **kwargs): + tf_gpu_cc_test(*args, **kwargs) + +def tf_gpu_only_cc_test( + name, + srcs = [], + deps = [], + tags = [], + data = [], + size = "medium", + args = [], + kernels = [], + linkopts = []): + tags = tags + tf_gpu_tests_tags() + + gpu_lib_name = "%s%s" % (name, "_gpu_lib") + tf_gpu_kernel_library( + name = gpu_lib_name, + srcs = srcs + tf_binary_additional_srcs(), + deps = deps, + testonly = 1, + ) + cc_test( + name = "%s%s" % (name, "_gpu"), + size = size, + args = args, + features = if_cuda(["-use_header_modules"]), + data = data + tf_binary_dynamic_kernel_dsos(), + deps = [":" + gpu_lib_name], + linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name), + tags = tags, + exec_properties = tf_exec_properties({"tags": tags}), + ) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_only_cc_test(*args, **kwargs): + tf_gpu_only_cc_test(*args, **kwargs) + +# Create a cc_test for each of the tensorflow tests listed in "tests", along +# with a test suite of the given name, if provided. 
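+# Illustrative usage sketch (file and target names below are hypothetical,
+# not part of upstream TensorFlow):
+#   tf_cc_tests(
+#       name = "small_tests",
+#       srcs = ["foo_test.cc", "bar_test.cc"],
+#       deps = [":test_support_lib"],
+#       create_named_test_suite = True,
+#   )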
+def tf_cc_tests( + srcs, + deps, + name = "", + linkstatic = 0, + tags = [], + size = "medium", + args = None, + linkopts = lrt_if_needed(), + kernels = [], + create_named_test_suite = False, + visibility = None): + test_names = [] + for src in srcs: + test_name = src_to_test_name(src) + tf_cc_test( + name = test_name, + size = size, + srcs = [src], + args = args, + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + tags = tags, + deps = deps, + visibility = visibility, + ) + test_names.append(test_name) + + # Add a test suite with the generated tests if a name was provided and + # it does not conflict any of the test names. + if create_named_test_suite: + native.test_suite( + name = name, + tests = test_names, + visibility = visibility, + tags = tags, + ) + +register_extension_info( + extension = tf_cc_tests, + label_regex_for_dep = "{extension_name}", +) + +def tf_cc_test_mkl( + srcs, + deps, + name = "", + data = [], + linkstatic = 0, + tags = [], + size = "medium", + kernels = [], + args = None): + # -fno-exceptions in nocopts breaks compilation if header modules are enabled. + disable_header_modules = ["-use_header_modules"] + + for src in srcs: + cc_test( + name = src_to_test_name(src), + srcs = if_mkl([src]) + tf_binary_additional_srcs(), + # Adding an explicit `-fexceptions` because `allow_exceptions = True` + # in `tf_copts` doesn't work internally. + copts = tf_copts() + ["-fexceptions"] + tf_openmp_copts(), + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + }) + _rpath_linkopts(src_to_test_name(src)) + tf_openmp_lopts(), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]), + data = data + tf_binary_dynamic_kernel_dsos(), + exec_properties = tf_exec_properties({"tags": tags}), + linkstatic = linkstatic, + tags = tags, + size = size, + args = args, + features = disable_header_modules, + ) + +def tf_gpu_cc_tests( + srcs, + deps, + name = "", + tags = [], + size = "medium", + linkstatic = 0, + args = None, + kernels = [], + linkopts = []): + for src in srcs: + tf_gpu_cc_test( + name = src_to_test_name(src), + size = size, + srcs = [src], + args = args, + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + tags = tags, + deps = deps, + ) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_cc_tests(*args, **kwargs): + tf_gpu_cc_tests(*args, **kwargs) + +def tf_java_test( + name, + srcs = [], + deps = [], + kernels = [], + *args, + **kwargs): + cc_library_name = name + "_cclib" + cc_library( + # TODO(b/183579145): Remove when cc_shared_library supports CcInfo or JavaInfo providers . + name = cc_library_name, + srcs = tf_binary_additional_srcs(fullversion = True) + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels), + ) + native.java_test( + name = name, + srcs = srcs, + deps = deps + [cc_library_name], + *args, + **kwargs + ) + +def _cuda_copts(opts = []): + """Gets the appropriate set of copts for (maybe) CUDA compilation. + + If we're doing CUDA compilation, returns copts for our particular CUDA + compiler. If we're not doing CUDA compilation, returns an empty list. 
+ + """ + return select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ] + opts, + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ] + opts, + }) + +# Build defs for TensorFlow kernels + +# When this target is built using --config=cuda, a cc_library is built +# that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional +# libraries needed by GPU kernels. +# +# When this target is built using --config=rocm, a cc_library is built +# that passes -DTENSORFLOW_USE_ROCM and '-x rocm', linking in additional +# libraries needed by GPU kernels. +def tf_gpu_kernel_library( + srcs, + copts = [], + cuda_copts = [], + deps = [], + hdrs = [], + **kwargs): + copts = copts + tf_copts() + _cuda_copts(opts = cuda_copts) + rocm_copts(opts = cuda_copts) + kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"] + + cuda_library( + srcs = srcs, + hdrs = hdrs, + copts = copts, + deps = deps + if_cuda([ + clean_dep("@org_tensorflow//tensorflow/tsl/cuda:cudart_stub"), + ]) + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_lib"), + ]), + alwayslink = 1, + **kwargs + ) + +tf_gpu_library = tsl_gpu_library + +# terminology changes: saving tf_cuda_* definition for compatibility +tf_cuda_library = tsl_gpu_library + +def tf_kernel_library( + name, + prefix = None, + srcs = None, + gpu_srcs = None, + hdrs = None, + deps = None, + gpu_deps = None, + alwayslink = 1, + copts = None, + gpu_copts = None, + is_external = False, + compatible_with = None, + **kwargs): + """A rule to build a TensorFlow OpKernel. + + May either specify srcs/hdrs or prefix. Similar to tf_gpu_library, + but with alwayslink=1 by default. If prefix is specified: + * prefix*.cc (except *.cu.cc) is added to srcs + * prefix*.h (except *.cu.h) is added to hdrs + * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs. + With the exception that test files are excluded. + For example, with prefix = "cast_op", + * srcs = ["cast_op.cc"] + * hdrs = ["cast_op.h"] + * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"] + * "cast_op_test.cc" is excluded + With prefix = "cwise_op" + * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"], + * hdrs = ["cwise_ops.h", "cwise_ops_common.h"], + * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc", + "cwise_ops.h", "cwise_ops_common.h", + "cwise_ops_gpu_common.cu.h"] + * "cwise_ops_test.cc" is excluded + """ + if not srcs: + srcs = [] + if not hdrs: + hdrs = [] + if not deps: + deps = [] + if not gpu_deps: + gpu_deps = [] + if not copts: + copts = [] + if not gpu_copts: + gpu_copts = [] + textual_hdrs = [] + copts = copts + tf_copts(is_external = is_external) + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]) + + # Override EIGEN_STRONG_INLINE to inline when + # --define=override_eigen_strong_inline=true to avoid long compiling time. 
+ # See https://github.com/tensorflow/tensorflow/issues/10521 + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]) + if prefix: + if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]): + if not gpu_srcs: + gpu_srcs = [] + gpu_srcs = gpu_srcs + native.glob( + [prefix + "*.cu.cc", prefix + "*.h"], + exclude = [prefix + "*test*"], + ) + srcs = srcs + native.glob( + [prefix + "*.cc"], + exclude = [prefix + "*test*", prefix + "*.cu.cc"], + ) + hdrs = hdrs + native.glob( + [prefix + "*.h"], + exclude = [prefix + "*test*", prefix + "*.cu.h", prefix + "*impl.h"], + ) + textual_hdrs = native.glob( + [prefix + "*impl.h"], + exclude = [prefix + "*test*", prefix + "*.cu.h"], + ) + cuda_deps = [clean_dep("@org_tensorflow//tensorflow/core:gpu_lib")] + if gpu_srcs: + for gpu_src in gpu_srcs: + if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"): + fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc" + .format(gpu_src)) + tf_gpu_kernel_library( + name = name + "_gpu", + srcs = gpu_srcs, + deps = deps + gpu_deps, + copts = gpu_copts, + **kwargs + ) + cuda_deps.extend([":" + name + "_gpu"]) + kwargs["tags"] = kwargs.get("tags", []) + [ + "req_dep=%s" % clean_dep("@org_tensorflow//tensorflow/core:gpu_lib"), + "req_dep=@local_config_cuda//cuda:cuda_headers", + ] + tf_gpu_library( + name = name, + srcs = srcs, + hdrs = hdrs, + textual_hdrs = textual_hdrs, + copts = copts, + cuda_deps = cuda_deps + gpu_deps, + linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 + alwayslink = alwayslink, + deps = deps, + compatible_with = compatible_with, + **kwargs + ) + + # TODO(gunan): CUDA dependency not clear here. Fix it. + tf_cc_shared_object( + name = "libtfkernel_%s.so" % name, + srcs = srcs + hdrs, + copts = copts, + tags = ["manual", "notap"], + deps = deps, + ) + +register_extension_info( + extension = tf_kernel_library, + label_regex_for_dep = "{extension_name}", +) + +def tf_mkl_kernel_library( + name, + prefix = None, + srcs = None, + hdrs = None, + deps = None, + alwayslink = 1, + # Adding an explicit `-fexceptions` because `allow_exceptions = True` + # in `tf_copts` doesn't work internally. + copts = tf_copts() + ["-fexceptions"] + tf_openmp_copts(), + linkopts = tf_openmp_lopts()): + """A rule to build MKL-based TensorFlow kernel libraries.""" + + if not bool(srcs): + srcs = [] + if not bool(hdrs): + hdrs = [] + + if prefix: + srcs = srcs + native.glob( + [prefix + "*.cc"], + exclude = [prefix + "*test*"], + ) + hdrs = hdrs + native.glob( + [prefix + "*.h"], + exclude = [prefix + "*test*"], + ) + + # -fno-exceptions in nocopts breaks compilation if header modules are enabled. + disable_header_modules = ["-use_header_modules"] + + cc_library( + name = name, + srcs = if_mkl(srcs), + hdrs = hdrs, + deps = deps, + linkopts = linkopts, + alwayslink = alwayslink, + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), + features = disable_header_modules, + ) + +def _get_transitive_headers(hdrs, deps): + """Obtain the header files for a target and its transitive dependencies. + + Args: + hdrs: a list of header files + deps: a list of targets that are direct dependencies + + Returns: + a collection of the transitive headers + """ + return depset( + hdrs, + transitive = [dep[CcInfo].compilation_context.headers for dep in deps], + ) + +def _get_repository_roots(ctx, files): + """Returns abnormal root directories under which files reside. 
+ + When running a ctx.action, source files within the main repository are all + relative to the current directory; however, files that are generated or exist + in remote repositories will have their root directory be a subdirectory, + e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function + returns the set of these devious directories, ranked and sorted by popularity + in order to hopefully minimize the number of I/O system calls within the + compiler, because includes have quadratic complexity. + """ + result = {} + for f in files.to_list(): + root = f.root.path + if root: + if root not in result: + result[root] = 0 + result[root] -= 1 + work = f.owner.workspace_root + if work: + if root: + root += "/" + root += work + if root: + if root not in result: + result[root] = 0 + result[root] -= 1 + return [k for v, k in sorted([(v, k) for k, v in result.items()])] + +def tf_custom_op_library_additional_deps(): + return [ + "@com_google_protobuf//:protobuf_headers", # copybara:comment + clean_dep("//third_party/eigen3"), + clean_dep("@org_tensorflow//tensorflow/core:framework_headers_lib"), + ] + +# A list of targets that contains the implementation of +# tf_custom_op_library_additional_deps. It's used to generate a DEF file for +# exporting symbols from _pywrap_tensorflow.dll on Windows. +def tf_custom_op_library_additional_deps_impl(): + return [ + # copybara:comment_begin + "@com_google_protobuf//:protobuf", + "@nsync//:nsync_cpp", + # copybara:comment_end + + # for //third_party/eigen3 + clean_dep("//third_party/eigen3"), + + # for //tensorflow/core:framework_headers_lib + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:reader_base"), + ] + +# Traverse the dependency graph along the "deps" attribute of the +# target and return a struct with one field called 'tf_collected_deps'. +# tf_collected_deps will be the union of the deps of the current target +# and the tf_collected_deps of the dependencies of this target. +def _collect_deps_aspect_impl(target, ctx): + direct, transitive = [], [] + all_deps = [] + if hasattr(ctx.rule.attr, "deps"): + all_deps += ctx.rule.attr.deps + if hasattr(ctx.rule.attr, "data"): + all_deps += ctx.rule.attr.data + if hasattr(ctx.rule.attr, "roots"): + all_deps += ctx.rule.attr.roots + for dep in all_deps: + direct.append(dep.label) + if hasattr(dep, "tf_collected_deps"): + transitive.append(dep.tf_collected_deps) + return struct(tf_collected_deps = depset(direct = direct, transitive = transitive)) + +collect_deps_aspect = aspect( + attr_aspects = ["deps", "data", "roots"], + implementation = _collect_deps_aspect_impl, +) + +def _dep_label(dep): + label = dep.label + return label.package + ":" + label.name + +# This rule checks that transitive dependencies don't depend on the targets +# listed in the 'disallowed_deps' attribute, but do depend on the targets listed +# in the 'required_deps' attribute. Dependencies considered are targets in the +# 'deps' attribute or the 'data' attribute. +def _check_deps_impl(ctx): + required_deps = ctx.attr.required_deps + disallowed_deps = ctx.attr.disallowed_deps + for input_dep in ctx.attr.deps: + if not hasattr(input_dep, "tf_collected_deps"): + continue + collected_deps = sets.make(input_dep.tf_collected_deps.to_list()) + for disallowed_dep in disallowed_deps: + if sets.contains(collected_deps, disallowed_dep.label): + fail( + "{src} cannot depend on {dep}. 
See: bazel query 'somepath(//{src}, //{dep})'".format( + src = _dep_label(input_dep), + dep = _dep_label(disallowed_dep), + ), + ) + for required_dep in required_deps: + if not sets.contains(collected_deps, required_dep.label): + fail( + _dep_label(input_dep) + " must depend on " + + _dep_label(required_dep), + ) + return [] + +check_deps = rule( + _check_deps_impl, + attrs = { + "deps": attr.label_list( + aspects = [collect_deps_aspect], + mandatory = True, + allow_files = True, + ), + "disallowed_deps": attr.label_list( + default = [], + allow_files = True, + ), + "required_deps": attr.label_list( + default = [], + allow_files = True, + ), + }, +) + +def tf_custom_op_library( + name, + srcs = [], + gpu_srcs = [], + deps = [], + gpu_deps = None, + linkopts = [], + copts = [], + **kwargs): + """Helper to build a dynamic library (.so) from the sources containing implementations of custom ops and kernels.""" + + if not gpu_deps: + gpu_deps = [] + + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:stream_executor_headers_lib"), + ]) + if_cuda([ + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart_static", + ]) + if_windows([ + clean_dep("@org_tensorflow//tensorflow/python:pywrap_tensorflow_import_lib"), + ]) + tf_custom_op_library_additional_deps() + + # Override EIGEN_STRONG_INLINE to inline when + # --define=override_eigen_strong_inline=true to avoid long compiling time. + # See https://github.com/tensorflow/tensorflow/issues/10521 + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]) + + if gpu_srcs: + basename = name.split(".")[0] + cuda_library( + name = basename + "_gpu", + srcs = gpu_srcs, + copts = copts + tf_copts() + _cuda_copts() + rocm_copts() + + if_tensorrt(["-DGOOGLE_TENSORRT=1"]), + deps = deps + gpu_deps, + **kwargs + ) + deps = deps + [":" + basename + "_gpu"] + + check_deps( + name = name + "_check_deps", + disallowed_deps = [ + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + ], + deps = deps, + ) + tf_cc_shared_object( + name = name, + srcs = srcs, + deps = deps, + data = if_static([name + "_check_deps"]), + copts = copts + tf_copts(is_external = True), + features = ["windows_export_all_symbols"], + linkopts = linkopts + select({ + "//conditions:default": [ + "-lm", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + }), + **kwargs + ) + +def tf_custom_op_py_library( + name, + srcs = [], + dso = [], + kernels = [], + srcs_version = "PY3", + visibility = None, + deps = [], + **kwargs): + _ignore = [kernels] + _make_tags_mutable(kwargs) + _plain_py_library( + name = name, + data = dso, + srcs = srcs, + srcs_version = srcs_version, + visibility = visibility, + deps = deps, + **kwargs + ) + +# In tf_py_wrap_cc_opensource generated libraries +# module init functions are not exported unless +# they contain one of the keywords in the version file +# this prevents custom python modules. 
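+# (Illustrative example of the problem, not from the upstream comment: a module
+# named `foo` whose `PyInit_foo` symbol is hidden by the version script fails to
+# import with "dynamic module does not define module export function".)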
+# This function attempts to append init_module_name to list of +# exported functions in version script +def _append_init_to_versionscript_impl(ctx): + mod_name = ctx.attr.module_name + if ctx.attr.is_version_script: + ctx.actions.expand_template( + template = ctx.file.template_file, + output = ctx.outputs.versionscript, + substitutions = { + "global:": "global:\n init_%s;\n _init_%s;\n PyInit_*;\n _PyInit_*;" % (mod_name, mod_name), + }, + is_executable = False, + ) + else: + ctx.actions.expand_template( + template = ctx.file.template_file, + output = ctx.outputs.versionscript, + substitutions = { + "*tensorflow*": "*tensorflow*\ninit_%s\n_init_%s\nPyInit_*\n_PyInit_*\n" % (mod_name, mod_name), + }, + is_executable = False, + ) + +_append_init_to_versionscript = rule( + attrs = { + "module_name": attr.string(mandatory = True), + "template_file": attr.label( + allow_single_file = True, + mandatory = True, + ), + "is_version_script": attr.bool( + default = True, + doc = "whether target is a ld version script or exported symbol list", + mandatory = False, + ), + }, + outputs = {"versionscript": "%{name}.lds"}, + implementation = _append_init_to_versionscript_impl, +) + +# This macro should only be used for pywrap_tensorflow_internal.so. +# It was copied and refined from the original tf_py_wrap_cc_opensource rule. +# buildozer: disable=function-docstring-args +def pywrap_tensorflow_macro_opensource( + name, + srcs = [], + roots = [], + deps = [], + dynamic_deps = [], + static_deps = [], + exports_filter = [], + copts = [], + version_script = None, + win_def_file = None): + """Builds the pywrap_tensorflow_internal shared object.""" + module_name = name.split("/")[-1] + + # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so + # and use that as the name for the rule producing the .so file. + cc_library_base = "/".join(name.split("/")[:-1] + ["_" + module_name]) + + # TODO(b/137885063): tf_cc_shared_object needs to be cleaned up; we really + # shouldn't be passing a name qualified with .so here. + cc_shared_library_name = cc_library_base + ".so" + cc_library_pyd_name = "/".join( + name.split("/")[:-1] + ["_" + module_name + ".pyd"], + ) + + # We need pybind11 to export the shared object PyInit symbol only in OSS. + extra_deps = [clean_dep("@pybind11")] + + if not version_script: + version_script = select({ + "@org_tensorflow//tensorflow:macos": clean_dep("@org_tensorflow//tensorflow:tf_exported_symbols.lds"), + "//conditions:default": clean_dep("@org_tensorflow//tensorflow:tf_version_script.lds"), + }) + vscriptname = name + "_versionscript" + _append_init_to_versionscript( + name = vscriptname, + is_version_script = select({ + "@org_tensorflow//tensorflow:macos": False, + "//conditions:default": True, + }), + module_name = module_name, + template_file = version_script, + ) + extra_linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s.lds)" % vscriptname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s.lds)" % vscriptname, + ], + }) + additional_linker_inputs = if_windows([], otherwise = ["%s.lds" % vscriptname]) + + # This is needed so that libtensorflow_cc is included in the pip package. 
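+    # For example (illustrative, assuming VERSION_MAJOR = 2): this adds
+    # libtensorflow_cc.so.2 on Linux and libtensorflow_cc.2.dylib on macOS;
+    # nothing is added on Windows.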
+ srcs += select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [clean_dep("@org_tensorflow//tensorflow:libtensorflow_cc.%s.dylib" % VERSION_MAJOR)], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [clean_dep("@org_tensorflow//tensorflow:libtensorflow_cc.so.%s" % VERSION_MAJOR)], + }) + + tf_cc_shared_library_opensource( + name = cc_shared_library_name, + srcs = srcs, + # framework_so is no longer needed as libtf.so is included via the extra_deps. + framework_so = [], + copts = copts + if_not_windows([ + "-Wno-self-assign", + "-Wno-sign-compare", + "-Wno-write-strings", + ]), + linkopts = extra_linkopts, + linkstatic = 1, + roots = roots, + deps = deps + extra_deps, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + exports_filter = exports_filter, + win_def_file = win_def_file, + additional_linker_inputs = additional_linker_inputs, + ) + + # When a non-versioned .so is added as a 'src' to a bazel target, it uses + # -l%(so_name) instead of -l:%(so_file) during linking. When -l%(so_name) + # is passed to ld, it will look for an associated file with the schema + # lib%(so_name).so. Since pywrap_tensorflow is not explicitly versioned + # and is not prefixed with lib_, we add a rule for the creation of an .so + # file with the canonical lib schema (e.g. libNAME.so), so that + # -l%(so_name) is resolved during linking. + # + # See: https://github.com/bazelbuild/bazel/blob/7a6808260a733d50983c1adf0cf5a7493472267f/src/main/java/com/google/devtools/build/lib/rules/cpp/LibrariesToLinkCollector.java#L319 + for pattern in SHARED_LIBRARY_NAME_PATTERNS: + name_os = pattern % (cc_library_base, "") + native.genrule( + name = name_os + "_rule", + srcs = [":" + cc_shared_library_name], + outs = [name_os], + cmd = "cp $< $@", + ) + + native.genrule( + name = "gen_" + cc_library_pyd_name, + srcs = [":" + cc_shared_library_name], + outs = [cc_library_pyd_name], + cmd = "cp $< $@", + ) + + # TODO(amitpatankar): Remove this py_library reference and + # move the dependencies to pywrap_tensorflow. This can + # eliminate one layer of Python import redundancy. We would + # have to change all pywrap_tensorflow imports to + # pywrap_tensorflow_internal. + + # Bazel requires an empty .py file for pywrap_tensorflow_internal.py. + empty_py_file = [name + ".py"] + native.genrule( + name = "empty_py_file_rule", + outs = empty_py_file, + cmd = "touch $@", + ) + + # TODO(b/271333181): This should be done more generally on Windows for every dll dependency + # (there is only one currently) that is not in the same directory, otherwise Python will fail to + # link the pyd (which is just a dll) because of missing dependencies. + _create_symlink("ml_dtypes.so", "@org_tensorflow//tensorflow/tsl/python/lib/core:ml_dtypes.so") + + _plain_py_library( + name = name, + srcs = [":" + name + ".py"], + srcs_version = "PY3", + data = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [ + ":" + cc_library_pyd_name, + ":ml_dtypes.so", + "@org_tensorflow//tensorflow/tsl/python/lib/core:ml_dtypes.so", + ], + "//conditions:default": [ + ":" + cc_shared_library_name, + ], + }), + ) + +# Export open source version of pywrap_tensorflow_macro under base name as well. +pywrap_tensorflow_macro = pywrap_tensorflow_macro_opensource + +# This macro is for running python tests against system installed pip package +# on Windows. +# +# py_test is built as an executable python zip file on Windows, which contains all +# dependencies of the target. 
Because of the C++ extensions, it would be very +# inefficient if the py_test zips all runfiles, plus we don't need them when running +# tests against system installed pip package. So we'd like to get rid of the deps +# of py_test in this case. +# +# In order to trigger the tests without bazel clean after getting rid of deps, +# we introduce the following : +# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker +# file of the pip package, the test gets to rerun when the pip package change. +# Note that this only works on Windows. See the definition of +# //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons. +# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test. +def py_test(deps = [], data = [], kernels = [], exec_properties = None, test_rule = _plain_py_test, **kwargs): + if not exec_properties: + exec_properties = tf_exec_properties(kwargs) + + _make_tags_mutable(kwargs) + test_rule( + deps = select({ + "//conditions:default": deps, + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): [], + }), + data = data + select({ + "//conditions:default": kernels, + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): ["@org_tensorflow//tensorflow/tools/pip_package:win_pip_package_marker"], + }), + exec_properties = exec_properties, + **kwargs + ) + +register_extension_info( + extension = py_test, + label_regex_for_dep = "{extension_name}", +) + +# Similar to py_test above, this macro is used to exclude dependencies for some py_binary +# targets in order to reduce the size of //tensorflow/tools/pip_package:simple_console_windows. +# See https://github.com/tensorflow/tensorflow/issues/22390 +def py_binary(name, deps = [], **kwargs): + # Add an extra target for dependencies to avoid nested select statement. + _plain_py_library( + name = name + "_deps", + deps = deps, + ) + + # Python version placeholder + _make_tags_mutable(kwargs) + _plain_py_binary( + name = name, + deps = select({ + "//conditions:default": [":" + name + "_deps"], + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): [], + }), + **kwargs + ) + +def pytype_library(name, pytype_deps = [], pytype_srcs = [], **kwargs): + # Types not enforced in OSS. + _make_tags_mutable(kwargs) + _plain_py_library(name = name, **kwargs) + +# Tensorflow uses rules_python 0.0.1, and in that version of rules_python, +# the rules require the tags value to be a mutable list because they +# modify it in-place. Later versions of rules_python don't have this +# requirement. +def _make_tags_mutable(kwargs): + if "tags" in kwargs and kwargs["tags"] != None: + # The value might be a frozen list, which looks just like + # a regular list. So always make a copy. + kwargs["tags"] = list(kwargs["tags"]) + +def tf_py_test( + name, + srcs, + size = "medium", + data = [], + main = None, + args = [], + tags = [], + shard_count = 1, + additional_visibility = [], + kernels = [], + flaky = 0, + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + tfrt_enabled = False, + # `tfrt_enabled` is set for some test targets, and if we enable + # TFRT tests just by that, this will enable TFRT builds for open source. + # TFRT open source is not fully integrated yet so we need a temporary + # workaround to enable TFRT only for internal builds. `tfrt_enabled_internal` + # will be set by `tensorflow.google.bzl`'s `tf_py_test` target, which is + # only applied for internal builds. 
+ # TODO(b/156911178): Revert this temporary workaround once TFRT open source + # is fully integrated with TF. + tfrt_enabled_internal = False, + **kwargs): + """Create one or more python tests with extra tensorflow dependencies.""" + xla_test_true_list = [] + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + deps = kwargs.pop("deps", []) + + # xla_enable_strict_auto_jit is used to run Tensorflow unit tests with all XLA compilable + # kernels compiled with XLA. + if xla_enable_strict_auto_jit: + xla_enabled = True + xla_test_true_list.append("@org_tensorflow//tensorflow/python/framework:is_xla_test_true") + if xla_enabled: + deps = deps + tf_additional_xla_deps_py() + if grpc_enabled: + deps = deps + tf_additional_grpc_deps_py() + + # NOTE(ebrevdo): This is a workaround for depset() not being able to tell + # the difference between 'dep' and 'clean_dep(dep)'. + for to_add in [ + "@org_tensorflow//tensorflow/python:extra_py_tests_deps", + ]: + if to_add not in deps and clean_dep(to_add) not in deps: + deps.append(clean_dep(to_add)) + + env = kwargs.pop("env", {}) + + # Python version placeholder + kwargs.setdefault("srcs_version", "PY3") + py_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = tags, + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")] + + additional_visibility, + deps = depset(deps + xla_test_true_list), + env = env, + **kwargs + ) + if tfrt_enabled_internal: + tfrt_env = {} + tfrt_env.update(env) + tfrt_env["EXPERIMENTAL_ENABLE_TFRT"] = "1" + + # None `main` defaults to `name` + ".py" in `py_test` target. However, since we + # are appending _tfrt. it becomes `name` + "_tfrt.py" effectively. So force + # set `main` argument without `_tfrt`. + if main == None: + main = name + ".py" + + py_test( + env = tfrt_env, + name = name + "_tfrt", + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = tags + ["tfrt"], + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")] + + additional_visibility, + deps = depset(deps + xla_test_true_list), + **kwargs + ) + +register_extension_info( + extension = tf_py_test, + label_regex_for_dep = "{extension_name}(_tfrt)?", +) + +def gpu_py_test( + name, + srcs, + size = "medium", + data = [], + main = None, + args = [], + shard_count = 1, + kernels = [], + tags = [], + flaky = 0, + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + xla_tags = [], # additional tags for xla_gpu tests + **kwargs): + if main == None: + main = name + ".py" + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + configs = ["cpu", "gpu"] + if "multi_gpu" in tags or "multi_and_single_gpu" in tags: + configs = configs + ["2gpu"] + + targets = [] + + for config in configs: + test_name = name + test_tags = tags + if config == "gpu": + test_tags = test_tags + tf_gpu_tests_tags() + if config == "2gpu": + test_tags = test_tags + two_gpu_tags + if "requires-gpu-nvidia" in test_tags: + test_tags.remove("requires-gpu-nvidia") + + # TODO(b/215751004): CPU on XLA tests are skipped intentionally. 
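+        # For example (illustrative): gpu_py_test(name = "foo_test", ...) expands to
+        # foo_test_cpu and foo_test_gpu (plus foo_test_2gpu for multi-GPU tags, and
+        # foo_test_xla_<config> below when xla_enable_strict_auto_jit is set), all
+        # grouped under a test_suite named foo_test.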
+ if config != "cpu" and xla_enable_strict_auto_jit: + strict_auto_jit_test_name = test_name + "_xla_" + config + tf_py_test( + name = strict_auto_jit_test_name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = test_tags + xla_tags + ["xla", "manual"], + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = True, + **kwargs + ) + targets.append(strict_auto_jit_test_name) + + test_name = test_name + "_" + config + + tf_py_test( + name = test_name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = test_tags, + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = False, + **kwargs + ) + targets.append(test_name) + + native.test_suite(name = name, tests = targets, tags = tags) + +# terminology changes: saving cuda_* definition for compatibility +def cuda_py_test(*args, **kwargs): + gpu_py_test(*args, **kwargs) + +register_extension_info( + extension = gpu_py_test, + label_regex_for_dep = "{extension_name}_cpu", +) + +def py_tests( + name, + srcs, + size = "medium", + kernels = [], + data = [], + tags = [], + shard_count = 1, + prefix = "", + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + tfrt_enabled = False, + **kwargs): + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + for src in srcs: + test_name = src.split("/")[-1].split(".")[0] + if prefix: + test_name = "%s_%s" % (prefix, test_name) + tf_py_test( + name = test_name, + size = size, + srcs = [src], + data = data, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = src, + shard_count = shard_count, + tags = tags, + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = xla_enable_strict_auto_jit, + tfrt_enabled = tfrt_enabled, + **kwargs + ) + +# Creates a genrule named for running tools/proto_text's generator to +# make the proto_text functions, for the protos passed in . +# +# Return a struct with fields (hdrs, srcs) containing the names of the +# generated files. 
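+#
+# For example (illustrative): srcs = ["foo.proto"] generates
+#   hdrs: foo.pb_text.h, foo.pb_text-impl.h
+#   srcs: foo.pb_text.cc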
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps = [], deps = [], visibility = None, compatible_with = None): + out_hdrs = ( + [ + p.replace(".proto", ".pb_text.h") + for p in srcs + ] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs] + ) + out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs] + native.genrule( + name = name + "_srcs", + srcs = srcs + protodeps + [clean_dep("@org_tensorflow//tensorflow/tools/proto_text:placeholder.txt")], + outs = out_hdrs + out_srcs, + visibility = visibility, + cmd = + "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " + + "$(@D) " + srcs_relative_dir + " $(SRCS)", + tools = [ + clean_dep("@org_tensorflow//tensorflow/tools/proto_text:gen_proto_text_functions"), + ], + compatible_with = compatible_with, + ) + + native.filegroup( + name = name + "_hdrs", + srcs = out_hdrs, + visibility = visibility, + compatible_with = compatible_with, + ) + + cc_library( + compatible_with = compatible_with, + name = name, + srcs = out_srcs, + hdrs = out_hdrs, + visibility = visibility, + deps = deps, + alwayslink = 1, + ) + +def tf_genrule_cmd_append_to_srcs(to_append): + return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append + + " >> $(@)") + +def _local_exec_transition_impl(settings, attr): + return { + # Force all targets in the subgraph to build on the local machine. + "//command_line_option:modify_execution_info": ".*=+no-remote-exec", + } + +# A transition that forces all targets in the subgraph to be built locally. +_local_exec_transition = transition( + implementation = _local_exec_transition_impl, + inputs = [], + outputs = [ + "//command_line_option:modify_execution_info", + ], +) + +def _local_genrule_impl(ctx): + ctx.actions.run_shell( + outputs = [ctx.outputs.out], + inputs = [f for t in ctx.attr.srcs for f in t.files.to_list()], + tools = [ctx.executable.exec_tool], + arguments = [f.path for t in ctx.attr.srcs for f in t.files.to_list()] + + [ctx.outputs.out.path], + command = "%s %s" % (ctx.executable.exec_tool.path, ctx.attr.arguments), + execution_requirements = {"no-remote-exec": ""}, + use_default_shell_env = True, + ) + +# A genrule that executes locally and forces the tool it runs to be built locally. +# For python, we want to build all py_binary rules locally that we also want +# to execute locally, as the remote image might use a different python version. +# TODO(klimek): Currently we still need to annotate the py_binary rules to use +# the local platform when building. When we know how to change the platform +# (https://github.com/bazelbuild/bazel/issues/11035) use this to not require +# annotating the py_binary rules. +_local_genrule_internal = rule( + implementation = _local_genrule_impl, + attrs = { + "out": attr.output(), + "exec_tool": attr.label( + executable = True, + cfg = _local_exec_transition, + allow_files = True, + ), + "arguments": attr.string(), + "srcs": attr.label_list( + allow_files = True, + ), + "_whitelist_function_transition": attr.label(default = "@bazel_tools//tools/whitelists/function_transition_whitelist"), + }, +) + +# Wrap the rule in a macro so we can pass in exec_compatible_with. +def _local_genrule(**kwargs): + _local_genrule_internal( + exec_compatible_with = [ + "@local_execution_config_platform//:platform_constraint", + ], + **kwargs + ) + +def tf_version_info_genrule(name, out, compatible_with = None): + # TODO(gunan): Investigate making this action hermetic so we do not need + # to run it locally. 
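+    # Roughly (illustrative): the rule below invokes gen_git_source on the local
+    # machine as
+    #   gen_git_source --generate <spec.json> <head> <branch_ref> <out> \
+    #       --git_tag_override=${GIT_TAG_OVERRIDE:-}
+    # with remote execution disabled via the no-remote-exec requirement.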
+ _local_genrule( + name = name, + out = out, + compatible_with = compatible_with, + exec_tool = "@org_tensorflow//tensorflow/tools/git:gen_git_source", + srcs = [ + "@local_config_git//:gen/spec.json", + "@local_config_git//:gen/head", + "@local_config_git//:gen/branch_ref", + ], + arguments = "--generate \"$@\" --git_tag_override=${GIT_TAG_OVERRIDE:-}", + ) + +def _dict_to_kv(d): + """Convert a dictionary to a space-joined list of key=value pairs.""" + return " " + " ".join(["%s=%s" % (k, v) for k, v in d.items()]) + +def tf_py_build_info_genrule(name, out): + _local_genrule( + name = name, + out = out, + exec_tool = "@org_tensorflow//tensorflow/tools/build_info:gen_build_info", + arguments = + "--raw_generate \"$@\" " + + " --key_value" + + " is_rocm_build=" + if_rocm("True", "False") + + " is_cuda_build=" + if_cuda("True", "False") + + " is_tensorrt_build=" + if_tensorrt("True", "False") + + if_windows(_dict_to_kv({ + "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll", + }), "") + if_windows_cuda(_dict_to_kv({ + "nvcuda_dll_name": "nvcuda.dll", + "cudart_dll_name": "cudart{cuda_version}.dll", + "cudnn_dll_name": "cudnn{cudnn_version}.dll", + }), ""), + ) + +def cc_library_with_android_deps( + deps, + android_deps = [], + common_deps = [], + copts = tf_copts(), + **kwargs): + deps = if_not_android(deps) + if_android(android_deps) + common_deps + cc_library(deps = deps, copts = copts, **kwargs) + +def tensorflow_opensource_extra_deps(): + return [] + +# Builds a pybind11 compatible library. +def pybind_library( + name, + copts = [], + features = [], + tags = [], + deps = [], + **kwargs): + # Mark common dependencies as required for build_cleaner. + tags = tags + ["req_dep=" + clean_dep("//third_party/pybind11"), "req_dep=@local_config_python//:python_headers"] + + native.cc_library( + name = name, + copts = copts + ["-fexceptions"], + features = features + [ + "-use_header_modules", # Required for pybind11. + "-parse_headers", + ], + tags = tags, + deps = deps + [clean_dep("//third_party/pybind11"), "@local_config_python//:python_headers"], + **kwargs + ) + +# buildozer: disable=function-docstring-args +def pybind_extension_opensource( + name, + srcs, + module_name = None, # Unused. + hdrs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + additional_exported_symbols = [], + compatible_with = None, + copts = [], + data = [], + defines = [], + deprecation = None, + enable_stub_generation = False, # Unused. 
+ features = [], + link_in_framework = False, + licenses = None, + linkopts = [], + pytype_deps = [], + pytype_srcs = [], + restricted_to = None, + srcs_version = "PY3", + testonly = None, + visibility = None, + win_def_file = None): + """Builds a generic Python extension module.""" + _ignore = [enable_stub_generation, module_name] # buildifier: disable=unused-variable + p = name.rfind("/") + if p == -1: + sname = name + prefix = "" + else: + sname = name[p + 1:] + prefix = name[:p + 1] + so_file = "%s%s.so" % (prefix, sname) + filegroup_name = "%s_filegroup" % name + pyd_file = "%s%s.pyd" % (prefix, sname) + exported_symbols = [ + "init%s" % sname, + "init_%s" % sname, + "PyInit_%s" % sname, + ] + additional_exported_symbols + + exported_symbols_file = "%s-exported-symbols.lds" % name + version_script_file = "%s-version-script.lds" % name + + exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) + version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) + + native.genrule( + name = name + "_exported_symbols", + outs = [exported_symbols_file], + cmd = "echo '%s' >$@" % exported_symbols_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + native.genrule( + name = name + "_version_script", + outs = [version_script_file], + cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + if static_deps: + cc_library_name = so_file + "_cclib" + cc_library( + name = cc_library_name, + hdrs = hdrs, + srcs = srcs + hdrs, + data = data, + deps = deps, + compatible_with = compatible_with, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + defines = defines, + features = features + ["-use_header_modules"], + restricted_to = restricted_to, + testonly = testonly, + visibility = visibility, + ) + + cc_shared_library( + name = so_file, + roots = [cc_library_name], + dynamic_deps = dynamic_deps, + static_deps = static_deps, + additional_linker_inputs = [exported_symbols_file, version_script_file], + compatible_with = compatible_with, + deprecation = deprecation, + features = features + ["-use_header_modules"], + licenses = licenses, + restricted_to = restricted_to, + shared_lib_name = so_file, + testonly = testonly, + user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + visibility = visibility, + ) + + # cc_shared_library can generate more than one file. + # Solution to avoid the error "variable '$<' : more than one input file." 
+ filegroup( + name = filegroup_name, + srcs = [so_file], + output_group = "main_shared_library_output", + testonly = testonly, + ) + else: + if link_in_framework: + srcs += tf_binary_additional_srcs() + + cc_binary( + name = so_file, + srcs = srcs + hdrs, + data = data, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + linkopts = linkopts + _rpath_linkopts(name) + select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + deps = deps + [ + exported_symbols_file, + version_script_file, + ], + defines = defines, + features = features + ["-use_header_modules"], + linkshared = 1, + testonly = testonly, + licenses = licenses, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + + # For Windows, emulate the above filegroup with the shared object. + native.alias( + name = filegroup_name, + actual = so_file, + ) + + # For Windows only. + native.genrule( + name = name + "_pyd_copy", + srcs = [filegroup_name], + outs = [pyd_file], + cmd = "cp $< $@", + output_to_bindir = True, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + testonly = testonly, + ) + + _plain_py_library( + name = name, + data = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [pyd_file], + "//conditions:default": [so_file], + }) + pytype_srcs, + deps = pytype_deps, + srcs_version = srcs_version, + licenses = licenses, + testonly = testonly, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + +# Export open source version of pybind_extension under base name as well. +pybind_extension = pybind_extension_opensource + +# Note: we cannot add //third_party/tf_runtime:__subpackages__ here, +# because that builds all of tf_runtime's packages, and some of them +# are known not to build on big endian systems. +# See b/148087476 and +# https://github.com/tensorflow/tensorflow/issues/57844. +# TODO(b/254083070): remove this definition once the packages move to TSL. +def tsl_async_value_deps(): + return [ + "@tf_runtime//:async_value", + "@tf_runtime//:dtype", + "@tf_runtime//:support", + "@tf_runtime//:concurrent_vector", + "@tf_runtime//:ref_count", + "@tf_runtime//third_party/llvm_derived:unique_any", + "@tf_runtime//third_party/llvm_derived:in_place", + ] + +def tf_python_pybind_static_deps(testonly = False): + # TODO(b/146808376): Reduce the dependencies to those that are really needed. 
+ static_deps = [ + "//:__subpackages__", + "@FP16//:__subpackages__", + "@FXdiv//:__subpackages__", + "@XNNPACK//:__subpackages__", + "@arm_neon_2_x86_sse//:__subpackages__", + "@bazel_tools//:__subpackages__", + "@boringssl//:__subpackages__", + "@clog//:__subpackages__", + "@com_github_cares_cares//:__subpackages__", + "@com_github_googlecloudplatform_tensorflow_gcp_tools//:__subpackages__", + "@com_github_grpc_grpc//:__subpackages__", + "@com_google_absl//:__subpackages__", + "@com_google_googleapis//:__subpackages__", + "@com_google_protobuf//:__subpackages__", + "@com_googlesource_code_re2//:__subpackages__", + "@compute_library//:__subpackages__", + "@cpuinfo//:__subpackages__", + "@cudnn_frontend_archive//:__subpackages__", # TFRT integration for TensorFlow. + "@curl//:__subpackages__", + "@dlpack//:__subpackages__", + "@double_conversion//:__subpackages__", + "@eigen_archive//:__subpackages__", + "@farmhash_archive//:__subpackages__", + "@farmhash_gpu_archive//:__subpackages__", + "@fft2d//:__subpackages__", + "@flatbuffers//:__subpackages__", + "@gemmlowp//:__subpackages__", + "@gif//:__subpackages__", + "@highwayhash//:__subpackages__", + "@hwloc//:__subpackages__", + "@icu//:__subpackages__", + "@jsoncpp_git//:__subpackages__", + "@libjpeg_turbo//:__subpackages__", + "@llvm-project//:__subpackages__", + "@llvm_openmp//:__subpackages__", + "@llvm_terminfo//:__subpackages__", + "@llvm_zlib//:__subpackages__", + "@local_config_cuda//:__subpackages__", + "@local_config_git//:__subpackages__", + "@local_config_nccl//:__subpackages__", + "@local_config_python//:__subpackages__", + "@local_config_rocm//:__subpackages__", + "@local_config_tensorrt//:__subpackages__", + "@local_execution_config_platform//:__subpackages__", + "@mkl_dnn_acl_compatible//:__subpackages__", + "@nsync//:__subpackages__", + "@nccl_archive//:__subpackages__", + "@onednn//:__subpackages__", + "@org_sqlite//:__subpackages__", + "@platforms//:__subpackages__", + "@png//:__subpackages__", + "@pthreadpool//:__subpackages__", + "@pybind11//:__subpackages__", + "@ruy//:__subpackages__", + "@snappy//:__subpackages__", + "@sobol_data//:__subpackages__", + "@stablehlo//:__subpackages__", + "@tf_runtime//:__subpackages__", + "@upb//:__subpackages__", + "@zlib//:__subpackages__", + ] + static_deps += tsl_async_value_deps() + static_deps += [] if not testonly else [ + "@com_google_benchmark//:__subpackages__", + "@com_google_googletest//:__subpackages__", + ] + return if_oss(static_deps) + +# buildozer: enable=function-docstring-args +def tf_python_pybind_extension_opensource( + name, + srcs, + module_name = None, + hdrs = [], # TODO(b/264128506): Drop after migration to cc_shared_library. + deps = [], + dynamic_deps = [], + static_deps = [], + compatible_with = None, + copts = [], + defines = [], + features = [], + testonly = False, + visibility = None, + win_def_file = None): + """A wrapper macro for pybind_extension_opensource that is used in tensorflow/python/BUILD. + + Please do not use it anywhere else as it may behave unexpectedly. b/146445820 + + It is used for targets under //third_party/tensorflow/python that link + against libtensorflow_framework.so and pywrap_tensorflow_internal.so. 
+ """ + extended_deps = deps + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]) + extended_deps += [] if dynamic_deps else if_windows([], ["@org_tensorflow//tensorflow:libtensorflow_framework_import_lib"]) + tf_binary_pybind_deps() + pybind_extension_opensource( + name, + srcs, + module_name = module_name, + hdrs = hdrs, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + deps = extended_deps, + compatible_with = compatible_with, + copts = copts, + defines = defines, + features = features, + testonly = testonly, + visibility = visibility, + win_def_file = win_def_file, + ) + +# Export open source version of tf_python_pybind_extension under base name as well. +tf_python_pybind_extension = tf_python_pybind_extension_opensource + +def tf_pybind_cc_library_wrapper_opensource(name, deps, visibility = None, **kwargs): + """Wrapper for cc_library and proto dependencies used by tf_python_pybind_extension_opensource. + + This wrapper ensures that cc libraries' and protos' headers are made + available to pybind code, without creating ODR violations in the dynamically + linked case. The symbols in these deps symbols should be linked to, and + exported by, the core pywrap_tensorflow_internal.so + """ + cc_header_only_library(name = name, deps = deps, visibility = visibility, **kwargs) + +# Export open source version of tf_pybind_cc_library_wrapper under base name as well. +tf_pybind_cc_library_wrapper = tf_pybind_cc_library_wrapper_opensource + +if_cuda_or_rocm = _if_cuda_or_rocm + +def tf_monitoring_framework_deps(link_to_tensorflow_framework = True): + """Get the monitoring libs that will be linked to the tensorflow framework. + + Currently in OSS, the protos must be statically linked to the tensorflow + framework, whereas the grpc should not be linked here. + """ + return select({ + "@org_tensorflow//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter_protos", + ], + "//conditions:default": [], + }) + +def tf_monitoring_python_deps(): + """Get the monitoring libs that will be linked to the python wrapper. + + Currently in OSS, the grpc must be statically linked to the python wrapper + whereas the protos should not be linked here. + """ + return select({ + "@org_tensorflow//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter", + ], + "//conditions:default": [], + }) + +# Teams sharing the same repo can provide their own ops_to_register.h file using +# this function, and pass in -Ipath/to/repo flag when building the target. +def tf_selective_registration_deps(): + return [] + +def tf_jit_compilation_passes_extra_deps(): + return [] + +def if_mlir(if_true, if_false = []): + return select({ + str(Label("@org_tensorflow//tensorflow:with_mlir_support")): if_true, + "//conditions:default": if_false, + }) + +def tf_enable_mlir_bridge(): + return select({ + str(Label("@org_tensorflow//tensorflow:enable_mlir_bridge")): [ + "@org_tensorflow//tensorflow/python/framework:is_mlir_bridge_test_true", + ], + str(Label("@org_tensorflow//tensorflow:disable_mlir_bridge")): [ + "@org_tensorflow//tensorflow/python/framework:is_mlir_bridge_test_false", + ], + "//conditions:default": [], + }) + +def tfcompile_target_cpu(name = ""): + return "" + +def tfcompile_dfsan_enabled(): + return False + +def tfcompile_dfsan_abilists(): + return [] + +def tf_external_workspace_visible(visibility): + # External workspaces can see this target. 
+ return ["//visibility:public"] + +def _filegroup_as_file_impl(ctx): + out = ctx.actions.declare_file(ctx.label.name) + ctx.actions.write( + output = out, + content = "\n".join([f.short_path for f in ctx.files.dep]), + ) + return DefaultInfo(files = depset([out])) + +_filegroup_as_file = rule( + implementation = _filegroup_as_file_impl, + attrs = { + "dep": attr.label(), + }, +) + +def filegroup_as_file(name, dep, visibility = []): + """Creates a filegroup ${name}_file which contains the file ${name}.""" + _filegroup_as_file(name = name, dep = dep) + native.filegroup( + name = name + "_file", + srcs = [name], + visibility = visibility, + ) + +def tf_grpc_dependencies(): + return ["@org_tensorflow//tensorflow:grpc"] + +def tf_grpc_cc_dependencies(): + return ["@org_tensorflow//tensorflow:grpc++"] + +def get_compatible_with_portable(): + return [] + +def filegroup(**kwargs): + native.filegroup(**kwargs) + +def genrule(**kwargs): + native.genrule(**kwargs) + +def internal_tfrt_deps(): + return [] + +def _tf_gen_options_header_impl(ctx): + header_depset = depset([ctx.outputs.output_header]) + + define_vals = {True: "true", False: "false"} + substitutions = {} + for target, identifier in ctx.attr.build_settings.items(): + setting_val = target[BuildSettingInfo].value + lines = [ + "// %s" % target.label, + "#define TF_OPTION_%s() %s" % (identifier, define_vals[setting_val]), + ] + substitutions["#define_option %s" % identifier] = "\n".join(lines) + + ctx.actions.expand_template( + template = ctx.file.template, + output = ctx.outputs.output_header, + substitutions = substitutions, + ) + + return [ + DefaultInfo(files = header_depset), + ] + +tf_gen_options_header = rule( + attrs = { + "output_header": attr.output( + doc = "File path for the generated header (output)", + mandatory = True, + ), + "template": attr.label( + doc = """Template for the header. + For each option name 'X' (see build_settings attribute), + '#define_option X' results in a macro 'TF_OPTION_X()' + """, + allow_single_file = True, + mandatory = True, + ), + "build_settings": attr.label_keyed_string_dict( + doc = """Dictionary from build-setting labels to option names. Example: + {"@org_tensorflow//tensorflow:x_setting" : "X"} + """, + providers = [BuildSettingInfo], + ), + }, + implementation = _tf_gen_options_header_impl, + doc = """ + Generates a header file for Bazel build settings. + + This is an alternative to setting preprocessor defines on the compiler + command line. It has a few advantages: + - Usage of the options requires #include-ing the header, and thus a + Bazel-level dependency. + - Each option has a definition site in source code, which mentions the + corresponding Bazel setting. This is particularly useful when + navigating code with the assistance of static analysis (e.g. + https://cs.opensource.google/tensorflow). + - Each option is represented as a FUNCTION()-style macro, which is always + defined (i.e. one uses #if instead of #ifdef). This allows forms like + 'if constexpr (TF_OPTION_FOO()) { ... }', and helps catch missing + dependencies (if 'F' is undefined, '#if F()' results in an error). + """, +) + +# These flags are used selectively to disable benign ptxas warnings for some +# build targets. On clang "-Xcuda-ptxas --disable-warnings" is sufficient, but +# that does not work on some versions of GCC. So for now this is empty in the +# open source build. +def tf_disable_ptxas_warning_flags(): + return [] + +# Use this to replace the `non_portable_tf_deps` (i.e., tensorflow/core/...) 
with +# tensorflow/core:portable_tensorflow_lib_lite when building portably. +def replace_with_portable_tf_lib_when_required(non_portable_tf_deps, use_lib_with_runtime = False): + portable_tf_lib = "@org_tensorflow//tensorflow/core:portable_tensorflow_lib_lite" + + return select({ + "@org_tensorflow//tensorflow:android": [portable_tf_lib], + "@org_tensorflow//tensorflow:ios": [portable_tf_lib], + "//conditions:default": non_portable_tf_deps, + }) + +def tf_python_framework_friends(): + return ["@org_tensorflow//tensorflow:__subpackages__"] diff --git a/build_deps/requirements.in b/build_deps/requirements.in new file mode 100644 index 00000000..175e899b --- /dev/null +++ b/build_deps/requirements.in @@ -0,0 +1,37 @@ +# Requirements for the Federated Compute Python development environment. +# +# * For packages that have a stable release, we use a version that is +# compatible with that release (e.g. `~=x.y`). See +# https://peps.python.org/pep-0440/#compatible-release for more information. +# * For packages that do not have a stable release, we use a version that +# matches a release that has been tested (e.g. `==x.y.z`). See +# https://peps.python.org/pep-0440/#version-matching for more information. +# +# Note: There is bug in `pip` when multiple packages use the compatible release +# operator `~=` to specify a version and one of those versions ends in `0`. See +# https://github.com/pypa/pip/issues/9613 for more information. In this case, +# use the equivalent clause `>=x.0,==x.*` instead of `~=x.0`. +# +# This assumes that the packages follow Semantic Versioning, see +# https://semver.org/. If a package follows a different versioning scheme or +# requires unique handling, we use a different version specifier and comment the +# versioning scheme or reasoning. + +absl-py~=1.4 +attrs~=23.1 +dm-tree~=0.1.8 +dill == 0.3.6 +pandas +fastparquet +portpicker>=1.6.0 +protobuf>=4.23 +pytest-xdist +pytest~=6.2.5 +scipy~=1.14.1 +tblib == 1.7.0 +tqdm +tf_keras +# The TensorFlow version should match what's specified in the WORKSPACE file for +# C++ targets. 
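+# (Illustrative note: per PEP 440, `tensorflow~=2.15.1` below is equivalent to
+# `tensorflow>=2.15.1,==2.15.*`; the `>=x.0,==x.*` workaround described above is
+# only needed when the pinned version ends in 0.)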
+tensorflow~=2.15.1 +typeguard~=2.13.3 diff --git a/build_deps/requirements_lock_3_10.txt b/build_deps/requirements_lock_3_10.txt new file mode 100644 index 00000000..419ad233 --- /dev/null +++ b/build_deps/requirements_lock_3_10.txt @@ -0,0 +1,983 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# bazel run //build_deps:requirements.update +# +--index-url https://pypi.tuna.tsinghua.edu.cn/simple + +absl-py==1.4.0 \ + --hash=sha256:0d3fe606adfa4f7db64792dd4c7aee4ee0c38ab75dfd353b7a83ed3e957fcb47 \ + --hash=sha256:d2c244d01048ba476e7c080bd2c6df5e141d211de80223460d5b3b8a2a58433d + # via + # -r build_deps/requirements.in + # tensorboard + # tensorflow +astunparse==1.6.3 \ + --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ + --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 + # via tensorflow +attrs==23.2.0 \ + --hash=sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30 \ + --hash=sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1 + # via + # -r build_deps/requirements.in + # pytest +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via google-auth +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + 
--hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + 
--hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +cramjam==2.10.0 \ + --hash=sha256:001fc2572adc655406fb899087f57a740e58a800b05acdccac8bf5759b617d90 \ + --hash=sha256:04f54bea9ce39c440d1ac6901fe4d647f9218dd5cd8fe903c6fe9c42bf5e1f3b \ + 
--hash=sha256:05793857773ec62101edf2c0d22d8edc955707727124f637d2f6cc138e5f97aa \ + --hash=sha256:06ad4a8b368d30ded1d932d9eed647962fbe44923269185a6bbd5e0d11cc39ab \ + --hash=sha256:0acb17e3681138b48300b27d3409742c81d5734ec39c650a60a764c135197840 \ + --hash=sha256:0d27fe3e316f9ae7fe1367b6daf0ffc993c1c66edae588165ac0f41f91a5a6b1 \ + --hash=sha256:112638a4cdf806509d2d2661cb519d239d731bd5fd2e95f211c48ac0f0deeab5 \ + --hash=sha256:11c5ef0c70d6bdd8e1d8afed8b0430709b22decc3865eb6c0656aa00117a7b3d \ + --hash=sha256:17dda15edf256362edb30dcb1d5ecdcd727d946c6be0d1b130e736f3f49487dc \ + --hash=sha256:1a200b74220dcd80c2bb99e3bfe1cdb1e4ed0f5c071959f4316abd65f9ef1e39 \ + --hash=sha256:1c071765bdd5eefa3b2157a61e84d72e161b63f95eb702a0133fee293800a619 \ + --hash=sha256:1e826469cfbb6dcd5b967591e52855073267835229674cfa3d327088805855da \ + --hash=sha256:22a7ab05c62b0a71fcd6db4274af1508c5ea039a43fb143ac50a62f86e6f32f7 \ + --hash=sha256:2464bdf0e2432e0f07a834f48c16022cd7f4648ed18badf52c32c13d6722518c \ + --hash=sha256:260732e3b5c56d6182586f3a7fc5e3f3641b27bfbad5883e8d8e292af85a6870 \ + --hash=sha256:26c44f17938cf00a339899ce6ea7ba12af7b1210d707a80a7f14724fba39869b \ + --hash=sha256:27b2625c0840b9a5522eba30b165940084391762492e03b9d640fca5074016ae \ + --hash=sha256:28a13c0317e71121b2059ffa8beefa2b185be241c52f740f6eb261f0067186db \ + --hash=sha256:2c1eb6e6c3d5c1cc3f7c7f8a52e034340a3c454641f019687fa94077c05da5c2 \ + --hash=sha256:2c24907c972aca7b56c8326307e15d78f56199852dda1e67e4e54c2672afede4 \ + --hash=sha256:2c7008bb54bdc5d130c0e8581925dfcbdc6f0a4d2051de7a153bfced9a31910f \ + --hash=sha256:2e419b65538786fc1f0cf776612262d4bf6c9449983d3fc0d0acfd86594fe551 \ + --hash=sha256:337ceb50bde7708b2a4068f3000625c23ceb1b2497edce2e21fd08ef58549170 \ + --hash=sha256:3484f1595eef64cefed05804d7ec8a88695f89086c49b086634e44c16f3d4769 \ + --hash=sha256:3596b6ceaf85f872c1e56295c6ec80bb15fdd71e7ed9e0e5c3e654563dcc40a2 \ + --hash=sha256:35bcecff38648908a4833928a892a1e7a32611171785bef27015107426bc1d9d \ + --hash=sha256:38fba4594dd0e2b7423ef403039e63774086ebb0696d9060db20093f18a2f43e \ + --hash=sha256:3a94fe7024137ed8bf200308000d106874afe52ff203f852f43b3547eddfa10e \ + --hash=sha256:3e0b70fe7796b63b87cb7ebfaad0ebaca7574fdf177311952f74b8bda6522fb8 \ + --hash=sha256:42dcd7c83104edae70004a8dc494e4e57de4940e3019e5d2cbec2830d5908a85 \ + --hash=sha256:44c15f6117031a84497433b5f55d30ee72d438fdcba9778fec0c5ca5d416aa96 \ + --hash=sha256:44c2660ee7c4c269646955e4e40c2693f803fbad12398bb31b2ad00cfc6027b8 \ + --hash=sha256:4b201aacc7a06079b063cfbcf5efe78b1e65c7279b2828d06ffaa90a8316579d \ + --hash=sha256:4b3e0067ae3513e4cbd0efbabbe5a2bcfa2c2d4bddc67188eeb0751b9a02fdb7 \ + --hash=sha256:4ba90f7b8f986934f33aad8cc029cf7c74842d3ecd5eda71f7531330d38a8dc4 \ + --hash=sha256:4c7bab3703babb93c9dd4444ac9797d01ec46cf521e247d3319bfb292414d053 \ + --hash=sha256:5018c7414047f640b126df02e9286a8da7cc620798cea2b39bac79731c2ee336 \ + --hash=sha256:50b59e981f219d6840ac43cda8e885aff1457944ddbabaa16ac047690bfd6ad1 \ + --hash=sha256:51eb00c72d4a93e4a2ddcc751ba2a7a1318026247e80742866912ec82b39e5ce \ + --hash=sha256:5264ac242697fbb1cfffa79d0153cbc4c088538bd99d60cfa374e8a8b83e2bb5 \ + --hash=sha256:570c81f991033e624874475ade96b601f1db2c51b3e69c324072adcfb23ef5aa \ + --hash=sha256:5b21b1672814ecce88f1da76635f0483d2d877d4cb8998db3692792f46279bf1 \ + --hash=sha256:5b34f4678d386c64d3be402fdf67f75e8f1869627ea2ec4decd43e828d3b6fba \ + --hash=sha256:5c52805c7ccb533fe42d3d36c91d237c97c3b6551cd6b32f98b79eeb30d0f139 \ + 
--hash=sha256:61b7f3c81e5e9015e73e5f423706b2f5e85a07ce79dea35645fad93505ff06cf \ + --hash=sha256:636a48e2d01fe8d7955e9523efd2f8efce55a0221f3b5d5b4bdf37c7ff056bf1 \ + --hash=sha256:645827af834a64145ba4b06f703342b2dbe1d40d1a48fb04e82373bd95cf68e2 \ + --hash=sha256:647553c44cf6b5ce2d9b56e743cc1eab886940d776b36438183e807bb5a7a42b \ + --hash=sha256:6655d04942f7c02087a6bba4bdc8d88961aa8ddf3fb9a05b3bad06d2d1ca321b \ + --hash=sha256:68362d87372a90b9717536238c81d74d7feb4a14392ac239ceb61c1c199a9bac \ + --hash=sha256:6d86c1e2006fe82a8679ed851c2462a6019b57255b3902d16ac35df4a37f6cdd \ + --hash=sha256:73b6ffc8ffe6546462ccc7e34ca3acd9eb3984e1232645f498544a7eab6b8aca \ + --hash=sha256:7699d61c712bc77907c48fe63a21fffa03c4dd70401e1d14e368af031fde7c21 \ + --hash=sha256:76e4e42f2ecf1aca0a710adaa23000a192efb81a2aee3bcc16761f1777f08a74 \ + --hash=sha256:77192bc1a9897ecd91cf977a5d5f990373e35a8d028c9141c8c3d3680a4a4cd7 \ + --hash=sha256:7ab6f36c772109c974890eafff2a841ddbf38ea1293b01a778b28f26089a890d \ + --hash=sha256:7dda9be2caf067ac21c4aa63497833e0984908b66849c07aaa42b1cfa93f5e1c \ + --hash=sha256:7ddbf6a3d3def7ae46638ebf87d7746ccebf22f885a87884ac24d97943af3f30 \ + --hash=sha256:8695857e0b0b5289fabb6c200b95e2b18d8575551ddd9d50746b3d78b6fb5aa8 \ + --hash=sha256:86b29e349064821ceeb14d60d01a11a0788f94e73ed4b3a5c3f9fac7aa4e2cd7 \ + --hash=sha256:88754dd516f0e2f4dd242880b8e760dc854e917315a17fe3fc626475bea9b252 \ + --hash=sha256:8b40d46d2aa566f8e3def953279cce0191e47364b453cda492db12a84dd97f78 \ + --hash=sha256:8bb0b6aaaa5f37091e05d756a3337faf0ddcffe8a68dbe8a710731b0d555ec8f \ + --hash=sha256:91ab85752a08dc875a05742cfda0234d7a70fadda07dd0b0582cfe991911f332 \ + --hash=sha256:92fd6e784ade210c3522bc627b3938821d12fac52acefe4d6630460e243e28de \ + --hash=sha256:967f5f0f22bf5dba4e4d7abe9594b28f5da95606225a50555926ff6e975d84dd \ + --hash=sha256:9cadef44f5ad4c5b4d06ba3c28464d70241a40539c0343b1821ba43102b6a9fc \ + --hash=sha256:9e20ebea6ec77232cd12e4084c8be6d03534dc5f3d027d365b32766beafce6c3 \ + --hash=sha256:a01e89e99ba066dfa2df40fe99a2371565f4a3adc6811a73c8019d9929a312e8 \ + --hash=sha256:a04376601c8f9714fb3a6a0a1699b85aab665d9d952a2a31fb37cf70e1be1fba \ + --hash=sha256:a094ca72440364bc1d0a793555875e515b0d7cc0eef171f4cd49c7e4855ba06e \ + --hash=sha256:a120fc0514c9ed9a4051d040ddd36176241d4f54c4a37d8e4f3d29ac9bdb4c3a \ + --hash=sha256:a2742eea6e336961167c5b6a2393fa04d54bdb10980f0d60ea36ed0a824e9a20 \ + --hash=sha256:a2923b8cd2fcbd22e0842decb66bf925a9e95bda165490d037c355e5df8fef68 \ + --hash=sha256:a71ab695a16c6d5aeae1f02fcc37fbd1ae876e8fb339337aca187012a3d6c0a2 \ + --hash=sha256:ac5a8a3ef660e6869a7761cd0664223eb546b2d17e9121c8ab0ad46353635611 \ + --hash=sha256:acef0e2c4d9f38428721a0ec878dee3fb73a35e640593d99c9803457dbb65214 \ + --hash=sha256:adf484b06063134ae604d4fc826d942af7e751c9d0b2fcab5bf1058a8ebe242b \ + --hash=sha256:afa36aa006d7692718fce427ecb276211918447f806f80c19096a627f5122e3d \ + --hash=sha256:b07fe3e48c881a75a11f722e1d5b052173b5e7c78b22518f659b8c9b4ac4c937 \ + --hash=sha256:b8dee2e4a402dac2df110e7b02fae49507a63b44b6fd91350cf069f31545a925 \ + --hash=sha256:ba19308b8e19cdaadfbf47142f52b705d2cbfb8edd84a8271573e50fa7fa022d \ + --hash=sha256:bcedda2ef2560e6e62cac03734ab1ad28616206b4d4f2d138440b4f43e18c395 \ + --hash=sha256:bf1321a40da930edeff418d561dfb03e6d59d5b8ab5cbab1c4b03ff0aa4c6d21 \ + --hash=sha256:c6afff7e9da53afb8d11eae27a20ee5709e2943b39af6c949b38424d0f271569 \ + --hash=sha256:cddd12ee5a2ef4100478db7f5563a9cdb8bc0a067fbd8ccd1ecdc446d2e6a41a \ + 
--hash=sha256:ce11be5722c9d433c5e1eb3980f16eb7d80828b9614f089e28f4f1724fc8973f \ + --hash=sha256:ce208a3e4043b8ce89e5d90047da16882456ea395577b1ee07e8215dce7d7c91 \ + --hash=sha256:d46fd5a9e8eb5d56eccc6191a55e3e1e2b3ab24b19ab87563a2299a39c855fd7 \ + --hash=sha256:d61a21e4153589bd53ffe71b553f93f2afbc8fb7baf63c91a83c933347473083 \ + --hash=sha256:d84581c869d279fab437182d5db2b590d44975084e8d50b164947f7aaa2c5f25 \ + --hash=sha256:de3e4be5aa71b73c2640c9b86e435ec033592f7f79787937f8342259106a63ae \ + --hash=sha256:def47645b1b970fd97f063da852b0ddc4f5bdee9af8d5b718d9682c7b828d89d \ + --hash=sha256:e0744e391ea8baf0ddea5a180b0aa71a6a302490c14d7a37add730bf0172c7c6 \ + --hash=sha256:e193918c81139361f3f45db19696d31847601f2c0e79a38618f34d7bff6ee704 \ + --hash=sha256:e1c03360c1760f8608dc5ce1ddd7e5491180765360cae8104b428d5f86fbe1b9 \ + --hash=sha256:e2d216ed4aca2090eabdd354204ae55ed3e13333d1a5b271981543696e634672 \ + --hash=sha256:e3012564760394dff89e7a10c5a244f8885cd155aec07bdbe2d6dc46be398614 \ + --hash=sha256:e821dd487384ae8004e977c3b13135ad6665ccf8c9874e68441cad1146e66d8a \ + --hash=sha256:eafdc9d1721afcb4be9d20b980b61d404a592c19067197976a4077f52727bd1a \ + --hash=sha256:f25db473667774725e4f34e738d644ffb205bf0bdc0e8146870a1104c5f42e4a \ + --hash=sha256:fb73ee9616e3efd2cf3857b019c66f9bf287bb47139ea48425850da2ae508670 \ + --hash=sha256:ff7b95bd299c9360e7cb8d226002d58e2917f594ea5af0373efc713f896622b9 + # via fastparquet +dill==0.3.6 \ + --hash=sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0 \ + --hash=sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373 + # via -r build_deps/requirements.in +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + 
--hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build_deps/requirements.in +execnet==2.1.1 \ + --hash=sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc \ + --hash=sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3 + # via pytest-xdist +fastparquet==2024.11.0 \ + --hash=sha256:053695c2f730b78a2d3925df7cd5c6444d6c1560076af907993361cc7accf3e2 \ + --hash=sha256:0a52eecc6270ae15f0d51347c3f762703dd667ca486f127dc0a21e7e59856ae5 \ + --hash=sha256:0b74333914f454344458dab9d1432fda9b70d62e28dc7acb1512d937ef1424ee \ + --hash=sha256:0e2d7f02f57231e6c86d26e9ea71953737202f20e948790e5d4db6d6a1a150dc \ + --hash=sha256:1ae953c0e3832ae3936b6d92fde493ac7d8b775d7d59d02f7f46f67e1c21ed24 \ + --hash=sha256:29d5c718817bcd765fc519b17f759cad4945974421ecc1931d3bdc3e05e57fa9 \ + --hash=sha256:36b5c9bd2ffaaa26ff45d59a6cefe58503dd748e0c7fad80dd905749da0f2b9e \ + --hash=sha256:374cdfa745aa7d5188430528d5841cf823eb9ad16df72ad6dadd898ccccce3be \ + --hash=sha256:403d31109d398b6be7ce84fa3483fc277c6a23f0b321348c0a505eb098a041cb \ + --hash=sha256:41d1610130b5cb1ce36467766191c5418cba8631e2bfe3affffaf13f9be4e7a8 \ + --hash=sha256:46b2db02fc2a1507939d35441c8ab211d53afd75d82eec9767d1c3656402859b \ + --hash=sha256:4abd3426607335e5ad09be29ef4eeccdf097710e44420deac16893cee64ea0d8 \ + --hash=sha256:4c8401bfd86cccaf0ab7c0ade58c91ae19317ff6092e1d4ad96c2178197d8124 \ + --hash=sha256:561202e8f0e859ccc1aa77c4aaad1d7901b2d50fd6f624ca018bae4c3c7a62ce \ + --hash=sha256:5914ecfa766b7763201b9f49d832a5e89c2dccad470ca4f9c9b228d9a8349756 \ + 
--hash=sha256:59e5c5b51083d5b82572cdb7aed0346e3181e3ac9d2e45759da2e804bdafa7ee \ + --hash=sha256:60ccf587410f0979105e17036df61bb60e1c2b81880dc91895cdb4ee65b71e7f \ + --hash=sha256:63e0e416e25c15daa174aad8ba991c2e9e5b0dc347e5aed5562124261400f87b \ + --hash=sha256:6595d3771b3d587a31137e985f751b4d599d5c8e9af9c4858e373fdf5c3f8720 \ + --hash=sha256:6b7df5d3b61a19d76e209fe8d3133759af1c139e04ebc6d43f3cc2d8045ef338 \ + --hash=sha256:6b936dcf40ca5fff9e70383d48811b1482b871ff74af857cb4db5f4d072f01ab \ + --hash=sha256:6ec7b398a86432993441d0a08dfae59e29649c803ed64ec4b1d7c3e0855b14cb \ + --hash=sha256:74a0b3c40ab373442c0fda96b75a36e88745d8b138fcc3a6143e04682cbbb8ca \ + --hash=sha256:869e167a4067116b4a27eb7adbe597130b2e2e9cfc0f3e84f60e2e182a933f23 \ + --hash=sha256:8b35823ac7a194134e5f82fa4a9659e42e8f9ad1f2d22a55fbb7b9e4053aabbb \ + --hash=sha256:9a9387e77ac608d8978774caaf1e19de67eaa1386806e514dcb19f741b19cfe5 \ + --hash=sha256:a3afdef2895c9f459135a00a7ed3ceafebfbce918a9e7b5d550e4fae39c1b64d \ + --hash=sha256:a5ad5fc14b0567e700bea3cd528a0bd45a6f9371370b49de8889fb3d10a6574a \ + --hash=sha256:bdadf7b6bad789125b823bfc5b0a719ba5c4a2ef965f973702d3ea89cff057f6 \ + --hash=sha256:cbbb9057a26acf0abad7adf58781ee357258b7708ee44a289e3bee97e2f55d42 \ + --hash=sha256:d20632964e65530374ff7cddd42cc06aa0a1388934903693d6d22592a5ba827b \ + --hash=sha256:d24c923a2d9d22a5e7564245f856e6462d524d57982ac8f7479cde991ff73362 \ + --hash=sha256:d281edd625c33628ba028d3221180283d6161bc5ceb55eae1f0ca1678f864f26 \ + --hash=sha256:dbad4b014782bd38b58b8e9f514fe958cfa7a6c4e187859232d29fd5c5ddd849 \ + --hash=sha256:dc475993232c6a64f350aeb928013a807eb93f78675810fd019cbcff39f6baf3 \ + --hash=sha256:e29ff7a367fafa57c6896fb6abc84126e2466811aefd3e4ad4070b9e18820e54 \ + --hash=sha256:e3b1fc73fd3e1b70b0de254bae7feb890436cb67e99458b88cb9bd3cc44db419 \ + --hash=sha256:eb3356862fba2f9b2ea8e679d66901f466c92be8e023439fe854bc392fbf40a6 \ + --hash=sha256:f9cca4c6b5969df5561c13786f9d116300db1ec22c7941e237cfca4ce602f59b \ + --hash=sha256:fa56b19a29008c34cfe8831e810f770080debcbffc69aabd1df4d47572181f9c \ + --hash=sha256:fbe4468146b633d8f09d7b196fea0547f213cb5ce5f76e9d1beb29eaa9593a93 + # via -r build_deps/requirements.in +flatbuffers==24.12.23 \ + --hash=sha256:2910b0bc6ae9b6db78dd2b18d0b7a0709ba240fb5585f286a3a2b30785c22dac \ + --hash=sha256:c418e0d48890f4142b92fd3e343e73a48f194e1f80075ddcc5793779b3585444 + # via tensorflow +fsspec==2025.3.2 \ + --hash=sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711 \ + --hash=sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6 + # via fastparquet +gast==0.6.0 \ + --hash=sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54 \ + --hash=sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb + # via tensorflow +google-auth==2.39.0 \ + --hash=sha256:0150b6711e97fb9f52fe599f55648950cc4540015565d8fbb31be2ad6e1548a2 \ + --hash=sha256:73222d43cdc35a3aeacbfdcaf73142a97839f10de930550d89ebfe1d0a00cde7 + # via + # google-auth-oauthlib + # tensorboard +google-auth-oauthlib==1.2.1 \ + --hash=sha256:2d58a27262d55aa1b87678c3ba7142a080098cbc2024f903c62355deb235d91f \ + --hash=sha256:afd0cad092a2eaa53cd8e8298557d6de1034c6cb4a740500b5357b648af97263 + # via tensorboard +google-pasta==0.2.0 \ + --hash=sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954 \ + --hash=sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed \ + --hash=sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e + # via tensorflow 
+grpcio==1.68.1 \ + --hash=sha256:025f790c056815b3bf53da850dd70ebb849fd755a4b1ac822cb65cd631e37d43 \ + --hash=sha256:04cfd68bf4f38f5bb959ee2361a7546916bd9a50f78617a346b3aeb2b42e2161 \ + --hash=sha256:0feb02205a27caca128627bd1df4ee7212db051019a9afa76f4bb6a1a80ca95e \ + --hash=sha256:1098f03dedc3b9810810568060dea4ac0822b4062f537b0f53aa015269be0a76 \ + --hash=sha256:12941d533f3cd45d46f202e3667be8ebf6bcb3573629c7ec12c3e211d99cfccf \ + --hash=sha256:255b1635b0ed81e9f91da4fcc8d43b7ea5520090b9a9ad9340d147066d1d3613 \ + --hash=sha256:298ee7f80e26f9483f0b6f94cc0a046caf54400a11b644713bb5b3d8eb387600 \ + --hash=sha256:2c4cec6177bf325eb6faa6bd834d2ff6aa8bb3b29012cceb4937b86f8b74323c \ + --hash=sha256:2cc1fd04af8399971bcd4f43bd98c22d01029ea2e56e69c34daf2bf8470e47f5 \ + --hash=sha256:334ab917792904245a028f10e803fcd5b6f36a7b2173a820c0b5b076555825e1 \ + --hash=sha256:3522c77d7e6606d6665ec8d50e867f13f946a4e00c7df46768f1c85089eae515 \ + --hash=sha256:37ea3be171f3cf3e7b7e412a98b77685eba9d4fd67421f4a34686a63a65d99f9 \ + --hash=sha256:390eee4225a661c5cd133c09f5da1ee3c84498dc265fd292a6912b65c421c78c \ + --hash=sha256:3aed6544e4d523cd6b3119b0916cef3d15ef2da51e088211e4d1eb91a6c7f4f1 \ + --hash=sha256:3ceb56c4285754e33bb3c2fa777d055e96e6932351a3082ce3559be47f8024f0 \ + --hash=sha256:44a8502dd5de653ae6a73e2de50a401d84184f0331d0ac3daeb044e66d5c5054 \ + --hash=sha256:4b177f5547f1b995826ef529d2eef89cca2f830dd8b2c99ffd5fde4da734ba73 \ + --hash=sha256:4efac5481c696d5cb124ff1c119a78bddbfdd13fc499e3bc0ca81e95fc573684 \ + --hash=sha256:52fbf85aa71263380d330f4fce9f013c0798242e31ede05fcee7fbe40ccfc20d \ + --hash=sha256:55857c71641064f01ff0541a1776bfe04a59db5558e82897d35a7793e525774c \ + --hash=sha256:66a24f3d45c33550703f0abb8b656515b0ab777970fa275693a2f6dc8e35f1c1 \ + --hash=sha256:6ab2d912ca39c51f46baf2a0d92aa265aa96b2443266fc50d234fa88bf877d8e \ + --hash=sha256:77d65165fc35cff6e954e7fd4229e05ec76102d4406d4576528d3a3635fc6172 \ + --hash=sha256:7dfc914cc31c906297b30463dde0b9be48e36939575eaf2a0a22a8096e69afe5 \ + --hash=sha256:7f20ebec257af55694d8f993e162ddf0d36bd82d4e57f74b31c67b3c6d63d8b2 \ + --hash=sha256:80af6f1e69c5e68a2be529990684abdd31ed6622e988bf18850075c81bb1ad6e \ + --hash=sha256:83bbf5807dc3ee94ce1de2dfe8a356e1d74101e4b9d7aa8c720cc4818a34aded \ + --hash=sha256:8720c25cd9ac25dd04ee02b69256d0ce35bf8a0f29e20577427355272230965a \ + --hash=sha256:8829924fffb25386995a31998ccbbeaa7367223e647e0122043dfc485a87c666 \ + --hash=sha256:8a3869a6661ec8f81d93f4597da50336718bde9eb13267a699ac7e0a1d6d0bea \ + --hash=sha256:8cb620037a2fd9eeee97b4531880e439ebfcd6d7d78f2e7dcc3726428ab5ef63 \ + --hash=sha256:919d7f18f63bcad3a0f81146188e90274fde800a94e35d42ffe9eadf6a9a6330 \ + --hash=sha256:95c87ce2a97434dffe7327a4071839ab8e8bffd0054cc74cbe971fba98aedd60 \ + --hash=sha256:963cc8d7d79b12c56008aabd8b457f400952dbea8997dd185f155e2f228db079 \ + --hash=sha256:96f473cdacfdd506008a5d7579c9f6a7ff245a9ade92c3c0265eb76cc591914f \ + --hash=sha256:9d1fae6bbf0816415b81db1e82fb3bf56f7857273c84dcbe68cbe046e58e1ccd \ + --hash=sha256:a0c8ddabef9c8f41617f213e527254c41e8b96ea9d387c632af878d05db9229c \ + --hash=sha256:a1b988b40f2fd9de5c820f3a701a43339d8dcf2cb2f1ca137e2c02671cc83ac1 \ + --hash=sha256:a47faedc9ea2e7a3b6569795c040aae5895a19dde0c728a48d3c5d7995fda385 \ + --hash=sha256:a8040f85dcb9830d8bbb033ae66d272614cec6faceee88d37a88a9bd1a7a704e \ + --hash=sha256:b33bd114fa5a83f03ec6b7b262ef9f5cac549d4126f1dc702078767b10c46ed9 \ + --hash=sha256:c08079b4934b0bf0a8847f42c197b1d12cba6495a3d43febd7e99ecd1cdc8d54 \ + 
--hash=sha256:c28848761a6520c5c6071d2904a18d339a796ebe6b800adc8b3f474c5ce3c3ad \ + --hash=sha256:cb400138e73969eb5e0535d1d06cae6a6f7a15f2cc74add320e2130b8179211a \ + --hash=sha256:cbb5780e2e740b6b4f2d208e90453591036ff80c02cc605fea1af8e6fc6b1bbe \ + --hash=sha256:ccf2ebd2de2d6661e2520dae293298a3803a98ebfc099275f113ce1f6c2a80f1 \ + --hash=sha256:d35740e3f45f60f3c37b1e6f2f4702c23867b9ce21c6410254c9c682237da68d \ + --hash=sha256:d99abcd61760ebb34bdff37e5a3ba333c5cc09feda8c1ad42547bea0416ada78 \ + --hash=sha256:ddda1aa22495d8acd9dfbafff2866438d12faec4d024ebc2e656784d96328ad0 \ + --hash=sha256:dffd29a2961f3263a16d73945b57cd44a8fd0b235740cb14056f0612329b345e \ + --hash=sha256:e4842e4872ae4ae0f5497bf60a0498fa778c192cc7a9e87877abd2814aca9475 \ + --hash=sha256:e8dbe3e00771bfe3d04feed8210fc6617006d06d9a2679b74605b9fed3e8362c \ + --hash=sha256:ee2e743e51cb964b4975de572aa8fb95b633f496f9fcb5e257893df3be854746 \ + --hash=sha256:eeb38ff04ab6e5756a2aef6ad8d94e89bb4a51ef96e20f45c44ba190fa0bcaad \ + --hash=sha256:f8261fa2a5f679abeb2a0a93ad056d765cdca1c47745eda3f2d87f874ff4b8c9 + # via + # tensorboard + # tensorflow +h5py==3.12.1 \ + --hash=sha256:018a4597f35092ae3fb28ee851fdc756d2b88c96336b8480e124ce1ac6fb9166 \ + --hash=sha256:050a4f2c9126054515169c49cb900949814987f0c7ae74c341b0c9f9b5056834 \ + --hash=sha256:06a903a4e4e9e3ebbc8b548959c3c2552ca2d70dac14fcfa650d9261c66939ed \ + --hash=sha256:1473348139b885393125126258ae2d70753ef7e9cec8e7848434f385ae72069e \ + --hash=sha256:2f0f1a382cbf494679c07b4371f90c70391dedb027d517ac94fa2c05299dacda \ + --hash=sha256:326d70b53d31baa61f00b8aa5f95c2fcb9621a3ee8365d770c551a13dbbcbfdf \ + --hash=sha256:3b15d8dbd912c97541312c0e07438864d27dbca857c5ad634de68110c6beb1c2 \ + --hash=sha256:3fdf95092d60e8130ba6ae0ef7a9bd4ade8edbe3569c13ebbaf39baefffc5ba4 \ + --hash=sha256:4532c7e97fbef3d029735db8b6f5bf01222d9ece41e309b20d63cfaae2fb5c4d \ + --hash=sha256:513171e90ed92236fc2ca363ce7a2fc6f2827375efcbb0cc7fbdd7fe11fecafc \ + --hash=sha256:52ab036c6c97055b85b2a242cb540ff9590bacfda0c03dd0cf0661b311f522f8 \ + --hash=sha256:577d618d6b6dea3da07d13cc903ef9634cde5596b13e832476dd861aaf651f3e \ + --hash=sha256:59400f88343b79655a242068a9c900001a34b63e3afb040bd7cdf717e440f653 \ + --hash=sha256:59685fe40d8c1fbbee088c88cd4da415a2f8bee5c270337dc5a1c4aa634e3307 \ + --hash=sha256:5c4b41d1019322a5afc5082864dfd6359f8935ecd37c11ac0029be78c5d112c9 \ + --hash=sha256:62be1fc0ef195891949b2c627ec06bc8e837ff62d5b911b6e42e38e0f20a897d \ + --hash=sha256:6fdf6d7936fa824acfa27305fe2d9f39968e539d831c5bae0e0d83ed521ad1ac \ + --hash=sha256:7b3b8f3b48717e46c6a790e3128d39c61ab595ae0a7237f06dfad6a3b51d5351 \ + --hash=sha256:84342bffd1f82d4f036433e7039e241a243531a1d3acd7341b35ae58cdab05bf \ + --hash=sha256:ad8a76557880aed5234cfe7279805f4ab5ce16b17954606cca90d578d3e713ef \ + --hash=sha256:ba51c0c5e029bb5420a343586ff79d56e7455d496d18a30309616fdbeed1068f \ + --hash=sha256:cb65f619dfbdd15e662423e8d257780f9a66677eae5b4b3fc9dca70b5fd2d2a3 \ + --hash=sha256:ccd9006d92232727d23f784795191bfd02294a4f2ba68708825cb1da39511a93 \ + --hash=sha256:d2b8dd64f127d8b324f5d2cd1c0fd6f68af69084e9e47d27efeb9e28e685af3e \ + --hash=sha256:d3e465aee0ec353949f0f46bf6c6f9790a2006af896cee7c178a8c3e5090aa32 \ + --hash=sha256:e4d51919110a030913201422fb07987db4338eba5ec8c5a15d6fab8e03d443fc + # via tensorflow +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +iniconfig==2.0.0 \ + 
--hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ + --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 + # via pytest +keras==2.15.0 \ + --hash=sha256:2dcc6d2e30cf9c951064b63c1f4c404b966c59caf09e01f3549138ec8ee0dd1f \ + --hash=sha256:81871d298c064dc4ac6b58440fdae67bfcf47c8d7ad28580fab401834c06a575 + # via tensorflow +libclang==18.1.1 \ + --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \ + --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \ + --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \ + --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \ + --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \ + --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \ + --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \ + --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \ + --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \ + --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe + # via tensorflow +markdown==3.7 \ + --hash=sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2 \ + --hash=sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803 + # via tensorboard +markupsafe==3.0.2 \ + --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \ + --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \ + --hash=sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0 \ + --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \ + --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \ + --hash=sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13 \ + --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \ + --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \ + --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \ + --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \ + --hash=sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0 \ + --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \ + --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \ + --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \ + --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \ + --hash=sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff \ + --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \ + --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \ + --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \ + --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \ + --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \ + --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \ + --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \ + --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \ + --hash=sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a \ + 
--hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \ + --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \ + --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \ + --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \ + --hash=sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144 \ + --hash=sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f \ + --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \ + --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \ + --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \ + --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \ + --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \ + --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \ + --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \ + --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \ + --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \ + --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \ + --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \ + --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \ + --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \ + --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \ + --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \ + --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \ + --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \ + --hash=sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29 \ + --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \ + --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \ + --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \ + --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \ + --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \ + --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \ + --hash=sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a \ + --hash=sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178 \ + --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \ + --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \ + --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \ + --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50 + # via werkzeug +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + 
--hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + --hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + --hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via tensorflow +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + 
--hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # fastparquet + # h5py + # ml-dtypes + # pandas + # scipy + # tensorboard + # tensorflow +oauthlib==3.2.2 \ + --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ + --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 + # via requests-oauthlib +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via tensorflow +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via + # fastparquet + # pytest + # tensorflow +pandas==2.2.3 \ + --hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \ + --hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \ + --hash=sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5 \ + --hash=sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4 \ + --hash=sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0 \ + --hash=sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32 \ + --hash=sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea \ + --hash=sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28 \ + --hash=sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f \ + --hash=sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348 \ + --hash=sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18 \ + --hash=sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468 \ + --hash=sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5 \ + --hash=sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e \ + --hash=sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667 \ + --hash=sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645 \ + --hash=sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13 \ + --hash=sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30 \ + --hash=sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3 \ + --hash=sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d \ + --hash=sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb \ + --hash=sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3 \ + --hash=sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039 \ + --hash=sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8 \ + --hash=sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd \ + --hash=sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761 \ + --hash=sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659 \ + --hash=sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57 \ + 
--hash=sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c \ + --hash=sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c \ + --hash=sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4 \ + --hash=sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a \ + --hash=sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9 \ + --hash=sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42 \ + --hash=sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2 \ + --hash=sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39 \ + --hash=sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc \ + --hash=sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698 \ + --hash=sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed \ + --hash=sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015 \ + --hash=sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24 \ + --hash=sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319 + # via + # -r build_deps/requirements.in + # fastparquet +pluggy==1.5.0 \ + --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ + --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 + # via pytest +portpicker==1.6.0 \ + --hash=sha256:b2787a41404cf7edbe29b07b9e0ed863b09f2665dcc01c1eb0c2261c1e7d0755 \ + --hash=sha256:bd507fd6f96f65ee02781f2e674e9dc6c99bbfa6e3c39992e3916204c9d431fa + # via -r build_deps/requirements.in +protobuf==4.25.5 \ + --hash=sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41 \ + --hash=sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea \ + --hash=sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8 \ + --hash=sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45 \ + --hash=sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584 \ + --hash=sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d \ + --hash=sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1 \ + --hash=sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f \ + --hash=sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a \ + --hash=sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173 \ + --hash=sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331 + # via + # -r build_deps/requirements.in + # tensorboard + # tensorflow +psutil==6.1.1 \ + --hash=sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca \ + --hash=sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377 \ + --hash=sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468 \ + --hash=sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3 \ + --hash=sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603 \ + --hash=sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac \ + --hash=sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303 \ + --hash=sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4 \ + --hash=sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160 \ + --hash=sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8 \ + 
--hash=sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003 \ + --hash=sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030 \ + --hash=sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777 \ + --hash=sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5 \ + --hash=sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53 \ + --hash=sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649 \ + --hash=sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8 + # via portpicker +py==1.11.0 \ + --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ + --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 + # via pytest +pyasn1==0.6.1 \ + --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \ + --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 \ + --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \ + --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6 + # via google-auth +pytest==6.2.5 \ + --hash=sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89 \ + --hash=sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134 + # via + # -r build_deps/requirements.in + # pytest-xdist +pytest-xdist==3.5.0 \ + --hash=sha256:cbb36f3d67e0c478baa57fa4edc8843887e0f6cfc42d677530a36d7472b32d8a \ + --hash=sha256:d075629c7e00b611df89f490a5063944bee7a4362a5ff11c7cc7824a03dfce24 + # via -r build_deps/requirements.in +python-dateutil==2.9.0.post0 \ + --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ + --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 + # via pandas +pytz==2025.1 \ + --hash=sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57 \ + --hash=sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e + # via pandas +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # requests-oauthlib + # tensorboard +requests-oauthlib==2.0.0 \ + --hash=sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 \ + --hash=sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9 + # via google-auth-oauthlib +rsa==4.9.1 \ + --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ + --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 + # via google-auth +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + 
--hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via -r build_deps/requirements.in +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # astunparse + # google-pasta + # python-dateutil + # tensorboard + # tensorflow +tblib==1.7.0 \ + --hash=sha256:059bd77306ea7b419d4f76016aef6d7027cc8a0785579b5aad198803435f882c \ + --hash=sha256:289fa7359e580950e7d9743eab36b0691f0310fce64dee7d9c31065b8f723e23 + # via -r build_deps/requirements.in +tensorboard==2.15.2 \ + --hash=sha256:a6f6443728064d962caea6d34653e220e34ef8df764cb06a8212c17e1a8f0622 + # via tensorflow +tensorboard-data-server==0.7.2 \ + --hash=sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb \ + --hash=sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60 \ + --hash=sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530 + # via tensorboard +tensorflow==2.15.1 \ + --hash=sha256:10132acc072d59696c71ce7221d2d8e0e3ff1e6bc8688dbac6d7aed8e675b710 \ + --hash=sha256:30c5ef9c758ec9ff7ce2aff76b71c980bc5119b879071c2cc623b1591a497a1a \ + --hash=sha256:432788ac5d1234b9e9b7c7f73603a5655271a28c293329c52c7c0b9434a1184e \ + --hash=sha256:6761efe511e6ee0f893f60738fefbcc51d6dc386eeaaafea59d21899ef369ffd \ + --hash=sha256:89b5aa1022dec47e567512eaf4e1271b8e6c1ff1984e30d0d9127bd1093ed4c5 \ + --hash=sha256:8e5431d45ceb416c2b1b6de87378054fbac7d2ed35d45b102d89a786613fffdc \ + --hash=sha256:91b51a507007d63a70b65be307d701088d15042a6399c0e2312b53072226e909 \ + 
--hash=sha256:a49f8755c74a89553294a99ab25aa87ab1cddbfa40fe58387e09f64f0578cedc \ + --hash=sha256:aa926114d1e13ffe5b2ea59c3f195216f26646d7fe36e9e5207b291e4b7902ff \ + --hash=sha256:aaf3cfa290597ebbdf19d1a78729e3f555e459506cd58f8d7399359ac5e02a05 \ + --hash=sha256:b75815b6a601edad52b4181e9805c8fcd04813a6ab1d5cd8127188dfd2788e20 \ + --hash=sha256:bb0edd69103c154245c5f209f0507355cc68ba7e4de350084bc31edc562478e4 \ + --hash=sha256:e73d43dbc68d8c711e70edecc4ac70472799a25ec4ec18a84d479ee18033d3c5 \ + --hash=sha256:ea290e435464cf0794f657b48786e5fa413362abe55ed771c172c25980d070ce \ + --hash=sha256:f8e85821317c9c0fbf1256e9f721cfb1400ba1e09becb844b3ddd91f744805fc + # via + # -r build_deps/requirements.in + # tf-keras +tensorflow-estimator==2.15.0 \ + --hash=sha256:aedf21eec7fb2dc91150fc91a1ce12bc44dbb72278a08b58e79ff87c9e28f153 + # via tensorflow +tensorflow-io-gcs-filesystem==0.37.1 \ + --hash=sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95 \ + --hash=sha256:249c12b830165841411ba71e08215d0e94277a49c551e6dd5d72aab54fe5491b \ + --hash=sha256:257aab23470a0796978efc9c2bcf8b0bc80f22e6298612a4c0a50d3f4e88060c \ + --hash=sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c \ + --hash=sha256:32c50ab4e29a23c1f91cd0f9ab8c381a0ab10f45ef5c5252e94965916041737c \ + --hash=sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed \ + --hash=sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda \ + --hash=sha256:8943036bbf84e7a2be3705cb56f9c9df7c48c9e614bb941f0936c58e3ca89d6f \ + --hash=sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70 \ + --hash=sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27 \ + --hash=sha256:b02f9c5f94fd62773954a04f69b68c4d576d076fd0db4ca25d5479f0fbfcdbad \ + --hash=sha256:ee5da49019670ed364f3e5fb86b46420841a6c3cb52a300553c63841671b3e6d \ + --hash=sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556 \ + --hash=sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f \ + --hash=sha256:fe8dcc6d222258a080ac3dfcaaaa347325ce36a7a046277f6b3e19abc1efb3c5 \ + --hash=sha256:ffebb6666a7bfc28005f4fbbb111a455b5e7d6cd3b12752b7050863ecb27d5cc + # via tensorflow +termcolor==2.5.0 \ + --hash=sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8 \ + --hash=sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f + # via tensorflow +tf-keras==2.15.1 \ + --hash=sha256:40ab605cecc7759c657cb2bccd9efaacd6fc2369a6c1eba8053890afeac46886 \ + --hash=sha256:8beaef46b8b4f1158de1410e7c0cf82f008b9e8c4ab3443f54ac1aaef9c2ad74 + # via -r build_deps/requirements.in +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f + # via pytest +tqdm==4.67.1 \ + --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ + --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 + # via -r build_deps/requirements.in +typeguard==2.13.3 \ + --hash=sha256:00edaa8da3a133674796cf5ea87d9f4b4c367d77476e185e80251cc13dfbb8c4 \ + --hash=sha256:5e3e3be01e887e7eafae5af63d1f36c849aaa94e3a0112097312aabfa16284f1 + # via -r build_deps/requirements.in +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via tensorflow +tzdata==2025.1 \ + 
--hash=sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694 \ + --hash=sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639 + # via pandas +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +werkzeug==3.1.3 \ + --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \ + --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746 + # via tensorboard +wheel==0.45.1 \ + --hash=sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729 \ + --hash=sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248 + # via astunparse +wrapt==1.14.1 \ + --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ + --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ + --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ + --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ + --hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ + --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ + --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ + --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ + --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ + --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ + --hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ + --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ + --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ + --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ + --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ + --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ + --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ + --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ + --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ + --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ + --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ + --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ + --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ + --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ + --hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ + --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ + --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ + --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ + --hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ + --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ + --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ + --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ + --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ + 
--hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ + --hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ + --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ + --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ + --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ + --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ + --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ + --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ + --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ + --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ + --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ + --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ + --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ + --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ + --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ + --hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ + --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ + --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ + --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ + --hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ + --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ + --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ + --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ + --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ + --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ + --hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ + --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ + --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ + --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ + --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ + --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ + --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ + --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ + --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ + --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ + --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ + --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ + --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ + --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ + --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ + --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af + # via tensorflow + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.6.0 \ + 
--hash=sha256:8199222558df7c86216af4f84c30e9b34a61d8ba19366cc914424cdbd28252f6 \ + --hash=sha256:ce74b49e8f7110f9bf04883b730f4765b774ef3ef28f722cce7c273d253aaf7d + # via + # tensorboard + # tensorflow diff --git a/build_deps/requirements_lock_3_11.txt b/build_deps/requirements_lock_3_11.txt new file mode 100644 index 00000000..a68dbad4 --- /dev/null +++ b/build_deps/requirements_lock_3_11.txt @@ -0,0 +1,733 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + 
--hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + 
--hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + --hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + 
--hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + 
--hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + --hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be + # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 
\ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + --hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ + --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ 
+ --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + 
--hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + --hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + --hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + 
--hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + --hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + --hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + 
--hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + 
--hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + --hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + 
--hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + 
--hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + --hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + 
--hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + 
--hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils diff --git a/build_deps/requirements_lock_3_12.txt b/build_deps/requirements_lock_3_12.txt new file mode 100644 index 00000000..86ac5e4c --- /dev/null +++ b/build_deps/requirements_lock_3_12.txt @@ -0,0 +1,739 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + 
--hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + 
--hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + 
--hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + --hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + --hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + 
--hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + 
--hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be + # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 \ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + 
--hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ + --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ + --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + 
--hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + --hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + 
--hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + --hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + --hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + 
--hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + --hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + --hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # 
via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + 
--hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + --hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + 
--hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + 
--hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + 
--hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + 
--hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.3.0 \ + --hash=sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd \ + --hash=sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686 + # via chex diff --git a/build_deps/requirements_lock_3_13.txt b/build_deps/requirements_lock_3_13.txt new file mode 100644 index 00000000..3683d95d --- /dev/null +++ b/build_deps/requirements_lock_3_13.txt @@ -0,0 +1,739 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + 
--hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + 
--hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + 
--hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + --hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + 
--hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + --hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be 
+ # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 \ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + --hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ 
+ --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ + --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + 
--hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + --hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + --hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + 
--hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + --hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + --hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + 
--hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + --hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + 
--hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + --hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + 
--hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + --hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + 
--hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + --hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + 
--hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + 
--hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.3.0 \ + --hash=sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd \ + --hash=sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686 + # via chex diff --git a/build_deps/tf_dependency/build_defs.bzl.tpl b/build_deps/tf_dependency/build_defs.bzl.tpl index 48542fb8..84e2163d 100644 --- a/build_deps/tf_dependency/build_defs.bzl.tpl +++ b/build_deps/tf_dependency/build_defs.bzl.tpl @@ -2,3 +2,4 @@ D_GLIBCXX_USE_CXX11_ABI = "%{tf_cx11_abi}" CPLUSPLUS_VERSION = "%{tf_cplusplus_ver}" +DTF_VERSION_INTEGER = "%{tf_version_integer}" \ No newline at end of file diff --git a/build_deps/tf_dependency/tf_configure.bzl b/build_deps/tf_dependency/tf_configure.bzl index be03e21d..0fdb70fe 100644 --- a/build_deps/tf_dependency/tf_configure.bzl +++ b/build_deps/tf_dependency/tf_configure.bzl @@ -12,6 +12,8 @@ _TF_CXX11_ABI_FLAG = "TF_CXX11_ABI_FLAG" _TF_CPLUSPLUS_VER = "TF_CPLUSPLUS_VER" +_TF_VERSION_INTEGER = "TF_VERSION_INTEGER" + def _tpl(repository_ctx, tpl, substitutions = {}, out = None): if not out: out = tpl @@ -211,6 +213,7 @@ def _tf_pip_impl(repository_ctx): tf_shared_cc_library_path = "%s/%s" % (tf_shared_library_dir, tf_shared_cc_library_name) tf_cx11_abi = "-D_GLIBCXX_USE_CXX11_ABI=%s" % 
(repository_ctx.os.environ[_TF_CXX11_ABI_FLAG]) tf_cplusplus_ver = "-std=%s" % repository_ctx.os.environ[_TF_CPLUSPLUS_VER] + tf_version_integer = "-DTF_VERSION_INTEGER=%s" % (repository_ctx.os.environ[_TF_VERSION_INTEGER]) tf_shared_library_rule = _symlink_genrule_for_dir( repository_ctx, @@ -244,6 +247,7 @@ def _tf_pip_impl(repository_ctx): { "%{tf_cx11_abi}": tf_cx11_abi, "%{tf_cplusplus_ver}": tf_cplusplus_ver, + "%{tf_version_integer}": tf_version_integer, }, ) diff --git a/build_deps/toolchains/gpu/crosstool/BUILD.tpl b/build_deps/toolchains/gpu/crosstool/BUILD.tpl deleted file mode 100644 index de954b78..00000000 --- a/build_deps/toolchains/gpu/crosstool/BUILD.tpl +++ /dev/null @@ -1,69 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - - -toolchain( - name = "toolchain-linux-x86_64", - exec_compatible_with = [ - "@platforms//os:linux", - "@platforms//cpu:x86_64", - ], - target_compatible_with = [ - "@platforms//os:linux", - "@platforms//cpu:x86_64", - ], - toolchain = ":cc-compiler-local", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -cc_toolchain_suite( - name = "toolchain", - toolchains = { - "local|compiler": ":cc-compiler-local", - "k8": ":cc-compiler-local", - "ppc": ":cc-compiler-local", - "aarch64": ":cc-compiler-local", - }, -) - -cc_toolchain( - name = "cc-compiler-local", - all_files = "%{linker_files}", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = "%{linker_files}", - objcopy_files = ":empty", - strip_files = ":empty", - # To support linker flags that need to go to the start of command line - # we need the toolchain to support parameter files. Parameter files are - # last on the command line and contain all shared libraries to link, so all - # regular options will be left of them. 
- supports_param_files = 1, - toolchain_config = ":cc-compiler-local-config", - toolchain_identifier = "local_linux", -) - -cc_toolchain_config( - name = "cc-compiler-local-config", - cpu = "local", - builtin_include_directories = "%{cxx_builtin_include_directories}".split(","), - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/usr/bin", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], -) diff --git a/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl b/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl deleted file mode 100644 index 1a13ac84..00000000 --- a/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl +++ /dev/null @@ -1,1409 +0,0 @@ -major_version: "local" -minor_version: "" -default_target_cpu: "same_as_host" - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "local" - target_cpu: "local" - target_system_name: "local" - toolchain_identifier: "local_linux" - - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lstdc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - feature { - name: "alwayslink" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,-no-as-needed" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. - feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-Wl,-z,relro,-z,now" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - flag: "-Wl,-z,relro,-z,now" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - %{host_compiler_warnings} - } - } - } - - # Keep stack frames for debugging, even in opt mode. 
- feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "build-id" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - # Stamp the binary with a unique identifier. - flag: "-Wl,--build-id=md5" - flag: "-Wl,--hash-style=gnu" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-no-canonical-prefixes" - %{extra_no_canonical_prefixes_flags} - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - %{linker_bin_path_flag} - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "alwayslink" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "build-id" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). - flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,--gc-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "%{host_compiler_path}" } - - # Use the default system toolchain for everything else. - tool_path { name: "ar" path: "/usr/bin/ar" } - tool_path { name: "compat-ld" path: "/usr/bin/ld" } - tool_path { name: "cpp" path: "/usr/bin/cpp" } - tool_path { name: "dwp" path: "/usr/bin/dwp" } - tool_path { name: "gcov" path: "/usr/bin/gcov" } - tool_path { name: "ld" path: "/usr/bin/ld" } - tool_path { name: "nm" path: "/usr/bin/nm" } - tool_path { name: "objcopy" path: "/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/usr/bin/objdump" } - tool_path { name: "strip" path: "/usr/bin/strip" } - - # Enabled dynamic linking. 
- linking_mode_flags { mode: DYNAMIC } - -%{host_compiler_includes} -} - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "macosx" - target_cpu: "darwin" - target_system_name: "local" - toolchain_identifier: "local_darwin" - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. - feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - %{host_compiler_warnings} - } - } - } - - # Keep stack frames for debugging, even in opt mode. 
- feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag:"-no-canonical-prefixes" - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - %{linker_bin_path_flag} - } - } - } - - feature { - name: "undefined-dynamic" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-undefined" - flag: "dynamic_lookup" - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - implies: "undefined-dynamic" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). - flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "%{host_compiler_path}" } - - # Use the default system toolchain for everything else. - tool_path { name: "ar" path: "/usr/bin/libtool" } - tool_path { name: "compat-ld" path: "/usr/bin/ld" } - tool_path { name: "cpp" path: "/usr/bin/cpp" } - tool_path { name: "dwp" path: "/usr/bin/dwp" } - tool_path { name: "gcov" path: "/usr/bin/gcov" } - tool_path { name: "ld" path: "/usr/bin/ld" } - tool_path { name: "nm" path: "/usr/bin/nm" } - tool_path { name: "objcopy" path: "/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/usr/bin/objdump" } - tool_path { name: "strip" path: "/usr/bin/strip" } - - # Enabled dynamic linking. 
- linking_mode_flags { mode: DYNAMIC } - -%{host_compiler_includes} -} - -toolchain { - toolchain_identifier: "local_windows" - host_system_name: "local" - target_system_name: "local" - - abi_version: "local" - abi_libc_version: "local" - target_cpu: "x64_windows" - compiler: "msvc-cl" - target_libc: "msvcrt" - -%{cxx_builtin_include_directory} - - tool_path { - name: "ar" - path: "%{msvc_lib_path}" - } - tool_path { - name: "ml" - path: "%{msvc_ml_path}" - } - tool_path { - name: "cpp" - path: "%{msvc_cl_path}" - } - tool_path { - name: "gcc" - path: "%{msvc_cl_path}" - } - tool_path { - name: "gcov" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "ld" - path: "%{msvc_link_path}" - } - tool_path { - name: "nm" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objcopy" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objdump" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "strip" - path: "wrapper/bin/msvc_nop.bat" - } - supports_interface_shared_objects: true - - # TODO(pcloudy): Review those flags below, they should be defined by cl.exe - compiler_flag: "/DCOMPILER_MSVC" - - # Don't define min/max macros in windows.h. - compiler_flag: "/DNOMINMAX" - - # Platform defines. - compiler_flag: "/D_WIN32_WINNT=0x0600" - # Turn off warning messages. - compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE" - compiler_flag: "/D_CRT_SECURE_NO_WARNINGS" - compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS" - - # Useful options to have on for compilation. - # Increase the capacity of object files to 2^32 sections. - compiler_flag: "/bigobj" - # Allocate 500MB for precomputed headers. - compiler_flag: "/Zm500" - # Use unsigned char by default. - compiler_flag: "/J" - # Use function level linking. - compiler_flag: "/Gy" - # Use string pooling. - compiler_flag: "/GF" - # Catch C++ exceptions only and tell the compiler to assume that functions declared - # as extern "C" never throw a C++ exception. - compiler_flag: "/EHsc" - - # Globally disabled warnings. - # Don't warn about elements of array being be default initialized. - compiler_flag: "/wd4351" - # Don't warn about no matching delete found. - compiler_flag: "/wd4291" - # Don't warn about diamond inheritance patterns. - compiler_flag: "/wd4250" - # Don't warn about insecure functions (e.g. non _s functions). - compiler_flag: "/wd4996" - - linker_flag: "/MACHINE:X64" - - feature { - name: "no_legacy_features" - } - - # TODO(klimek): Previously we were using a .bat file to start python to run - # the python script that can redirect to nvcc - unfortunately .bat files - # have a rather short maximum length for command lines (8k). Instead, we - # now use the python binary as the compiler and pass the python script to - # it at the start of the command line. Investigate different possibilities - # to run the nvcc wrapper, either using pyinstaller --onefile, or writing - # a small C++ wrapper to redirect. - feature { - name: "redirector" - enabled: true - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - flag_group { - flag: "-B" - flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py" - } - } - } - - # Suppress startup banner. 
- feature { - name: "nologo" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - flag_group { - flag: "/nologo" - } - } - } - - feature { - name: 'has_configured_linker_path' - } - - # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary - feature { - name: 'no_stripping' - } - - # This feature indicates this is a toolchain targeting Windows. - feature { - name: 'targets_windows' - implies: 'copy_dynamic_libraries_to_binary' - enabled: true - } - - feature { - name: 'copy_dynamic_libraries_to_binary' - } - - action_config { - config_name: 'assemble' - action_name: 'assemble' - tool { - tool_path: '%{msvc_ml_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'preprocess-assemble' - action_name: 'preprocess-assemble' - tool { - tool_path: '%{msvc_ml_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'c-compile' - action_name: 'c-compile' - tool { - tool_path: '%{msvc_cl_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-compile' - action_name: 'c++-compile' - tool { - tool_path: '%{msvc_cl_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-link-executable' - action_name: 'c++-link-executable' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - } - - action_config { - config_name: 'c++-link-dynamic-library' - action_name: 'c++-link-dynamic-library' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-nodeps-dynamic-library' - action_name: 'c++-link-nodeps-dynamic-library' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 
'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-static-library' - action_name: 'c++-link-static-library' - tool { - tool_path: '%{msvc_lib_path}' - } - implies: 'nologo' - implies: 'archiver_flags' - implies: 'input_param_flags' - implies: 'linker_param_file' - implies: 'msvc_env' - } - - # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are - # not used in this crosstool - feature { - name: 'legacy_compile_flags' - flag_set { - expand_if_all_available: 'legacy_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'legacy_compile_flags' - flag: '%{legacy_compile_flags}' - } - } - } - - feature { - name: "msvc_env" - env_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - env_entry { - key: "PATH" - value: "%{msvc_env_path}" - } - env_entry { - key: "INCLUDE" - value: "%{msvc_env_include}" - } - env_entry { - key: "LIB" - value: "%{msvc_env_lib}" - } - env_entry { - key: "TMP" - value: "%{msvc_env_tmp}" - } - env_entry { - key: "TEMP" - value: "%{msvc_env_tmp}" - } - } - } - - feature { - name: 'include_paths' - flag_set { - action: "assemble" - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - flag_group { - iterate_over: 'quote_include_paths' - flag: '/I%{quote_include_paths}' - } - flag_group { - iterate_over: 'include_paths' - flag: '/I%{include_paths}' - } - flag_group { - iterate_over: 'system_include_paths' - flag: '/I%{system_include_paths}' - } - } - } - - feature { - name: "preprocessor_defines" - flag_set { - action: "assemble" - action: "preprocess-assemble" - action: "c-compile" - action: "c++-compile" - action: "c++-header-parsing" - action: "c++-module-compile" - flag_group { - flag: "/D%{preprocessor_defines}" - iterate_over: "preprocessor_defines" - } - } - } - - # Tell Bazel to parse the output of /showIncludes - feature { - name: 'parse_showincludes' - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-module-compile' - action: 'c++-header-parsing' - flag_group { - flag: "/showIncludes" - } - } - } - - - feature { - name: 'generate_pdb_file' - requires: { - feature: 'dbg' - } - requires: { - feature: 'fastbuild' - } - } - - feature { - name: 'shared_flag' - flag_set { - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/DLL' - } - } - } - - feature { - name: 'linkstamps' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - expand_if_all_available: 'linkstamp_paths' - flag_group { - iterate_over: 'linkstamp_paths' - flag: '%{linkstamp_paths}' - } - } - } - - feature { - name: 'output_execpath_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: 
'/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'archiver_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-static-library' - flag_group { - flag: '/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'input_param_flags' - flag_set { - expand_if_all_available: 'interface_library_output_path' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/IMPLIB:%{interface_library_output_path}" - } - } - flag_set { - expand_if_all_available: 'libopts' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'libopts' - flag: '%{libopts}' - } - } - flag_set { - expand_if_all_available: 'libraries_to_link' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 'c++-link-static-library' - flag_group { - iterate_over: 'libraries_to_link' - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file_group' - } - iterate_over: 'libraries_to_link.object_files' - flag_group { - flag: '%{libraries_to_link.object_files}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'interface_library' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'static_library' - } - flag_group { - expand_if_false: 'libraries_to_link.is_whole_archive' - flag: '%{libraries_to_link.name}' - } - flag_group { - expand_if_true: 'libraries_to_link.is_whole_archive' - flag: '/WHOLEARCHIVE:%{libraries_to_link.name}' - } - } - } - } - } - - # Since this feature is declared earlier in the CROSSTOOL than - # "user_link_flags", this feature will be applied prior to it anwyhere they - # are both implied. And since "user_link_flags" contains the linkopts from - # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD - # file. - feature { - name: 'linker_subsystem_flag' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/SUBSYSTEM:CONSOLE' - } - } - } - - # The "user_link_flags" contains user-defined linkopts (from build rules) - # so it should be defined after features that declare user-overridable flags. - # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag - # but we want to let the user override it, therefore "link_flag_subsystem" is - # defined earlier in the CROSSTOOL file than "user_link_flags". 
- feature { - name: 'user_link_flags' - flag_set { - expand_if_all_available: 'user_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'user_link_flags' - flag: '%{user_link_flags}' - } - } - } - feature { - name: 'legacy_link_flags' - flag_set { - expand_if_all_available: 'legacy_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'legacy_link_flags' - flag: '%{legacy_link_flags}' - } - } - } - - feature { - name: 'linker_param_file' - flag_set { - expand_if_all_available: 'linker_param_file' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 'c++-link-static-library' - flag_group { - flag: '@%{linker_param_file}' - } - } - } - - feature { - name: 'static_link_msvcrt' - } - - feature { - name: 'static_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MT" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'dynamic_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MD" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'static_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MTd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dynamic_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MDd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dbg' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FULL" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'fastbuild' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FASTLINK" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'opt' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/O2" - flag: "/DNDEBUG" - } - } - } - - feature { - name: 'user_compile_flags' - flag_set { - expand_if_all_available: 'user_compile_flags' - action: 
'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'user_compile_flags' - flag: '%{user_compile_flags}' - } - } - } - - feature { - name: 'sysroot' - flag_set { - expand_if_all_available: 'sysroot' - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'sysroot' - flag: '--sysroot=%{sysroot}' - } - } - } - - feature { - name: 'unfiltered_compile_flags' - flag_set { - expand_if_all_available: 'unfiltered_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'unfiltered_compile_flags' - flag: '%{unfiltered_compile_flags}' - } - } - } - - feature { - name: 'compiler_output_flags' - flag_set { - action: 'assemble' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - flag: '/Zi' - } - } - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_assembly_file' - flag: '/Fa%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_preprocess_file' - flag: '/P' - flag: '/Fi%{output_file}' - } - } - } - - feature { - name: 'compiler_input_flags' - flag_set { - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'source_file' - flag: '/c' - flag: '%{source_file}' - } - } - } - - feature { - name : 'def_file', - flag_set { - expand_if_all_available: 'def_file_path' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEF:%{def_file_path}" - # We can specify a different DLL name in DEF file, /ignore:4070 suppresses - # the warning message about DLL name doesn't match the default one. 
- # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx - flag: "/ignore:4070" - } - } - } - - feature { - name: 'windows_export_all_symbols' - } - - feature { - name: 'no_windows_export_all_symbols' - } - - linking_mode_flags { mode: DYNAMIC } -} diff --git a/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl b/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl deleted file mode 100755 index ba002b45..00000000 --- a/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl +++ /dev/null @@ -1,1493 +0,0 @@ -"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "tool", - "tool_path", - "variable_with_value", -) -load( - "@bazel_tools//tools/build_defs/cc:action_names.bzl", - "ASSEMBLE_ACTION_NAME", - "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", - "CLIF_MATCH_ACTION_NAME", - "CPP_COMPILE_ACTION_NAME", - "CPP_HEADER_PARSING_ACTION_NAME", - "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_EXECUTABLE_ACTION_NAME", - "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", - "CPP_MODULE_CODEGEN_ACTION_NAME", - "CPP_MODULE_COMPILE_ACTION_NAME", - "C_COMPILE_ACTION_NAME", - "LINKSTAMP_COMPILE_ACTION_NAME", - "LTO_BACKEND_ACTION_NAME", - "LTO_INDEXING_ACTION_NAME", - "OBJCPP_COMPILE_ACTION_NAME", - "OBJCPP_EXECUTABLE_ACTION_NAME", - "OBJC_ARCHIVE_ACTION_NAME", - "OBJC_COMPILE_ACTION_NAME", - "OBJC_EXECUTABLE_ACTION_NAME", - "OBJC_FULLY_LINK_ACTION_NAME", - "PREPROCESS_ASSEMBLE_ACTION_NAME", - "STRIP_ACTION_NAME", -) - -ACTION_NAMES = struct( - assemble = ASSEMBLE_ACTION_NAME, - c_compile = C_COMPILE_ACTION_NAME, - cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, - clif_match = CLIF_MATCH_ACTION_NAME, - cpp_compile = CPP_COMPILE_ACTION_NAME, - cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, - cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, - cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, - cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, - cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, - ld_embed_data = "ld_embed_data", - linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, - lto_backend = LTO_BACKEND_ACTION_NAME, - lto_indexing = LTO_INDEXING_ACTION_NAME, - objc_archive = OBJC_ARCHIVE_ACTION_NAME, - objc_compile = OBJC_COMPILE_ACTION_NAME, - objc_executable = OBJC_EXECUTABLE_ACTION_NAME, - objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, - objcopy_embed_data = "objcopy_embed_data", - objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, - objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, - preprocess_assemble = PREPROCESS_ASSEMBLE_ACTION_NAME, - strip = STRIP_ACTION_NAME, -) - -def _impl(ctx): - if (ctx.attr.cpu == "darwin"): - toolchain_identifier = "local_darwin" - elif (ctx.attr.cpu == "local"): - toolchain_identifier = "local_linux" - elif (ctx.attr.cpu == "x64_windows"): - toolchain_identifier = "local_windows" - else: - fail("Unreachable") - - host_system_name = "local" - - target_system_name = "local" - - if (ctx.attr.cpu == "darwin"): - target_cpu = "darwin" - elif (ctx.attr.cpu == "local"): - target_cpu = "local" - elif (ctx.attr.cpu == "x64_windows"): - target_cpu = "x64_windows" - else: - fail("Unreachable") - - if (ctx.attr.cpu == 
"local"): - target_libc = "local" - elif (ctx.attr.cpu == "darwin"): - target_libc = "macosx" - elif (ctx.attr.cpu == "x64_windows"): - target_libc = "msvcrt" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - compiler = "compiler" - elif (ctx.attr.cpu == "x64_windows"): - compiler = "msvc-cl" - else: - fail("Unreachable") - - abi_version = "local" - - abi_libc_version = "local" - - cc_target_os = None - - builtin_sysroot = None - - all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ] - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = ctx.attr.msvc_lib_path)], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - c_compile_action = action_config( - action_name = ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - action_configs = [] - elif (ctx.attr.cpu == "x64_windows"): - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - cpp_compile_action, - cpp_link_executable_action, - 
cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - else: - fail("Unreachable") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - pic_feature = feature( - name = "pic", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group(flags = ["-fPIC"], expand_if_available = "pic"), - flag_group( - flags = ["-fPIE"], - expand_if_not_available = "pic", - ), - ], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = ([ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ctx.attr.host_unfiltered_compile_flags, - ), - ], - ), - ] if ctx.attr.host_unfiltered_compile_flags else []), - ) - - determinism_feature = feature( - name = "determinism", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - 
actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie"])], - ), - ], - ) - else: - hardening_feature = None - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ctx.attr.msvc_env_path), - env_entry( - key = "INCLUDE", - value = ctx.attr.msvc_env_include, - ), - env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), - env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), - env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), - ], - ), - ], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - warnings_feature = feature( - name = "warnings", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-Wall"] + ctx.attr.host_compiler_warnings, - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.assemble], - flag_groups = [ - 
flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0600", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", - "/bigobj", - "/Zm500", - "/J", - "/Gy", - "/GF", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-g"])], - ), - ], - implies = ["common"], - ) - elif (ctx.attr.cpu == "x64_windows"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], - ), - ], - implies = ["generate_pdb_file"], - ) - else: - dbg_feature = None - - undefined_dynamic_feature = feature( - name = "undefined-dynamic", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], - ), - ], - ) - - parse_showincludes_feature = feature( - name = "parse_showincludes", - flag_sets = [ - 
flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - disable_assertions_feature = feature( - name = "disable-assertions", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-DNDEBUG"])], - ), - ], - ) - - if (ctx.attr.cpu == "x64_windows"): - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group(flags = ["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - elif (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) - else: - fastbuild_feature = None - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - redirector_feature = feature( - name = "redirector", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - 
ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - flag_groups = [ - flag_group( - flags = [ - "-B", - "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", - ], - ), - ], - ), - ], - ) - - linker_bin_path_feature = feature( - name = "linker-bin-path", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "darwin"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "x64_windows"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], - ), - ], - ) - else: - opt_feature = None - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( - flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - frame_pointer_feature = feature( - name = "frame-pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], - ), - ], - ) - - build_id_feature = feature( - name = "build-id", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( 
- flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "darwin"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lc++"])], - ), - ], - ) - elif (ctx.attr.cpu == "local"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lstdc++"])], - ), - ], - ) - else: - stdlib_feature = None - - no_stripping_feature = feature(name = "no_stripping") - - alwayslink_feature = feature( - name = "alwayslink", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - ), - ], - ) - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = [ - "-no-canonical-prefixes", - ] + ctx.attr.extra_no_canonical_prefixes_flags, - ), - ], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - 
ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], - ), - ], - ) - else: - no_canonical_prefixes_feature = None - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - cpp11_feature = feature( - name = "c++11", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-std=c++11"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "alwayslink", - "hardening", - "warnings", - "frame-pointer", - "build-id", - "no-canonical-prefixes", - "linker-bin-path", - ], - ) - elif (ctx.attr.cpu == "darwin"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "hardening", - "warnings", - "frame-pointer", - "no-canonical-prefixes", - "linker-bin-path", - "undefined-dynamic", - ], - ) - else: - common_feature = None - - if (ctx.attr.cpu == "local"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - alwayslink_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - build_id_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "darwin"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - undefined_dynamic_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "x64_windows"): - features = [ - no_legacy_features_feature, - redirector_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - no_windows_export_all_symbols_feature, - 
supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - else: - fail("Unreachable") - - cxx_builtin_include_directories = ctx.attr.builtin_include_directories - - if (ctx.attr.cpu == "x64_windows"): - tool_paths = [ - tool_path(name = "ar", path = ctx.attr.msvc_lib_path), - tool_path(name = "ml", path = ctx.attr.msvc_ml_path), - tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = ctx.attr.msvc_link_path), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - elif (ctx.attr.cpu == "local"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - elif (ctx.attr.cpu == "darwin"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - else: - fail("Unreachable") - - out = ctx.actions.declare_file(ctx.label.name) - ctx.actions.write(out, "Fake executable") - return [ - cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = [], - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = [], - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ), - DefaultInfo( - executable = out, - ), - ] - -cc_toolchain_config = rule( - attrs = { - "cpu": attr.string( - mandatory = True, - values = [ - "darwin", - 
"local", - "x64_windows", - ], - ), - "builtin_include_directories": attr.string_list(), - "extra_no_canonical_prefixes_flags": attr.string_list(), - "host_compiler_path": attr.string(), - "host_compiler_prefix": attr.string(), - "host_compiler_warnings": attr.string_list(), - "host_unfiltered_compile_flags": attr.string_list(), - "linker_bin_path": attr.string(), - "msvc_cl_path": attr.string(default = "msvc_not_used"), - "msvc_env_include": attr.string(default = "msvc_not_used"), - "msvc_env_lib": attr.string(default = "msvc_not_used"), - "msvc_env_path": attr.string(default = "msvc_not_used"), - "msvc_env_tmp": attr.string(default = "msvc_not_used"), - "msvc_lib_path": attr.string(default = "msvc_not_used"), - "msvc_link_path": attr.string(default = "msvc_not_used"), - "msvc_ml_path": attr.string(default = "msvc_not_used"), - }, - executable = True, - provides = [CcToolchainConfigInfo], - implementation = _impl, -) diff --git a/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl deleted file mode 100644 index 81c16c61..00000000 --- a/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs. - -SYNOPSIS: - crosstool_wrapper_is_not_gcc [options passed in by cc_library() - or cc_binary() rule] - -DESCRIPTION: - This script is expected to be called by the cc_library() or cc_binary() bazel - rules. When the option "-x cuda" is present in the list of arguments passed - to this script, it invokes the nvcc CUDA compiler. Most arguments are passed - as is as a string to --compiler-options of nvcc. When "-x cuda" is not - present, this wrapper invokes hybrid_driver_is_not_gcc with the input - arguments as is. - -NOTES: - Changes to the contents of this file must be propagated from - //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to - //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -__author__ = 'keveman@google.com (Manjunath Kudlur)' - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') - -NVCC_PATH = '%{nvcc_path}' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) -NVCC_VERSION = '%{cuda_version}' - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from the argv list. - - Args: - argv: A list of strings, possibly the argv passed to main(). 
- option: The option whose value to extract, without the leading '-'. - - Returns: - A list of values, either directly following the option, - (eg., -opt val1 val2) or values collected from multiple occurrences of - the option (eg., -opt val1 -opt val2). - """ - - parser = ArgumentParser() - parser.add_argument('-' + option, nargs='*', action='append') - args, _ = parser.parse_known_args(argv) - if not args or not vars(args)[option]: - return [] - else: - return sum(vars(args)[option], []) - - -def GetHostCompilerOptions(argv): - """Collect the -isystem, -iquote, and --sysroot option values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be used as the --compiler-options to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-isystem', nargs='*', action='append') - parser.add_argument('-iquote', nargs='*', action='append') - parser.add_argument('--sysroot', nargs=1) - parser.add_argument('-g', nargs='*', action='append') - parser.add_argument('-fno-canonical-system-headers', action='store_true') - - args, _ = parser.parse_known_args(argv) - - opts = '' - - if args.isystem: - opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) - if args.iquote: - opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) - if args.g: - opts += ' -g' + ' -g'.join(sum(args.g, [])) - if args.fno_canonical_system_headers: - opts += ' -fno-canonical-system-headers' - if args.sysroot: - opts += ' --sysroot ' + args.sysroot[0] - - return opts - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be passed directly to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, _ = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return ' '.join(['--'+a for a in options]) - return '' - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - host_compiler_options = GetHostCompilerOptions(argv) - nvcc_compiler_options = GetNvccOptions(argv) - opt_option = GetOptionValue(argv, 'O') - m_options = GetOptionValue(argv, 'm') - m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) - include_options = GetOptionValue(argv, 'I') - out_file = GetOptionValue(argv, 'o') - depfiles = GetOptionValue(argv, 'MF') - defines = GetOptionValue(argv, 'D') - defines = ''.join([' -D' + define for define in defines]) - undefines = GetOptionValue(argv, 'U') - undefines = ''.join([' -U' + define for define in undefines]) - std_options = GetOptionValue(argv, 'std') - # Supported -std flags as of CUDA 9.0. Only keep last to mimic gcc/clang. - nvcc_allowed_std_options = ["c++03", "c++11", "c++14", "c++17"] - std_options = ''.join([' -std=' + define - for define in std_options if define in nvcc_allowed_std_options]) - - # The list of source files get passed after the -c option. 
I don't know of - # any other reliable way to just get the list of source files to be compiled. - src_files = GetOptionValue(argv, 'c') - - # Pass -w through from host to nvcc, but don't do anything fancier with - # warnings-related flags, since they're not necessarily the same across - # compilers. - warning_options = ' -w' if '-w' in argv else '' - - if len(src_files) == 0: - return 1 - if len(out_file) != 1: - return 1 - - opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0) - else ' -g -G') - - includes = (' -I ' + ' -I '.join(include_options) - if len(include_options) > 0 - else '') - - # Unfortunately, there are other options that have -c prefix too. - # So allowing only those look like C/C++ files. - src_files = [f for f in src_files if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - srcs = ' '.join(src_files) - out = ' -o ' + out_file[0] - - supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] - nvccopts = '-D_FORCE_INLINES ' - supported_cuda_compute_capabilities = sorted([ - x.replace(".", "") for x in supported_cuda_compute_capabilities]) - for capability in supported_cuda_compute_capabilities[:-1]: - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % ( - capability, capability) - if supported_cuda_compute_capabilities: - capability = supported_cuda_compute_capabilities[-1] - nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % ( - capability, capability, capability) - nvccopts += ' ' + nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += std_options - nvccopts += m_options - nvccopts += warning_options - - if depfiles: - # Generate the dependency file - depfile = depfiles[0] - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + '"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) - if log: Log(cmd) - exit_status = os.system(cmd) - if exit_status != 0: - return exit_status - - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + ' -fPIC"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' -c ' + srcs + out) - - # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. - # Need to investigate and fix. - cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd - if log: Log(cmd) - return os.system(cmd) - - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). 
- cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl deleted file mode 100644 index 1a097568..00000000 --- a/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. - -DESCRIPTION: - This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') - -NVCC_PATH = '%{nvcc_path}' -NVCC_VERSION = '%{cuda_version}' -NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" -supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from options. - - Args: - option: The option whose value to extract, without the leading '/'. - - Returns: - 1. A list of values, either directly following the option, - (eg., /opt val1 val2) or values collected from multiple occurrences of - the option (eg., /opt val1 /opt val2). - 2. The leftover options. - """ - - parser = ArgumentParser(prefix_chars='/') - parser.add_argument('/' + option, nargs='*', action='append') - args, leftover = parser.parse_known_args(argv) - if args and vars(args)[option]: - return (sum(vars(args)[option], []), leftover) - return ([], leftover) - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - 1. The string that can be passed directly to nvcc. - 2. The leftover options. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, leftover = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return (['--' + a for a in options], leftover) - return ([], leftover) - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. 
- - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - src_files = [f for f in argv if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - if len(src_files) == 0: - raise Error('No source files found for cuda compilation.') - - out_file = [ f for f in argv if f.startswith('/Fo') ] - if len(out_file) != 1: - raise Error('Please sepecify exactly one output file for cuda compilation.') - out = ['-o', out_file[0][len('/Fo'):]] - - nvcc_compiler_options, argv = GetNvccOptions(argv) - - opt_option, argv = GetOptionValue(argv, 'O') - opt = ['-g', '-G'] - if (len(opt_option) > 0 and opt_option[0] != 'd'): - opt = ['-O2'] - - include_options, argv = GetOptionValue(argv, 'I') - includes = ["-I " + include for include in include_options] - - defines, argv = GetOptionValue(argv, 'D') - defines = ['-D' + define for define in defines] - - undefines, argv = GetOptionValue(argv, 'U') - undefines = ['-U' + define for define in undefines] - - # The rest of the unrecongized options should be passed to host compiler - host_compiler_options = [option for option in argv if option not in (src_files + out_file)] - - m_options = ["-m64"] - - nvccopts = ['-D_FORCE_INLINES'] - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( - capability, capability, capability)] - nvccopts += nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += m_options - nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] - nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files - # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP - # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check - # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver - # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. - if os.path.isfile(NVCC_TEMP_DIR): - os.remove(NVCC_TEMP_DIR) - if not os.path.exists(NVCC_TEMP_DIR): - os.makedirs(NVCC_TEMP_DIR) - nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] - cmd = [NVCC_PATH] + nvccopts - if log: - Log(cmd) - proc = subprocess.Popen(cmd, - stdout=sys.stdout, - stderr=sys.stderr, - env=os.environ.copy(), - shell=True) - proc.wait() - return proc.returncode - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). 
- cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log')) - and not flag.startswith(('-nvcc_options'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/build_deps/toolchains/gpu/cub.BUILD b/build_deps/toolchains/gpu/cub.BUILD deleted file mode 100644 index cdc9e4f3..00000000 --- a/build_deps/toolchains/gpu/cub.BUILD +++ /dev/null @@ -1,25 +0,0 @@ -# Description: CUB library which is a set of primitives for GPU programming. - -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda") - -package( - default_visibility = ["//visibility:public"], -) - -licenses(["notice"]) # BSD - -filegroup( - name = "cub_header_files", - srcs = glob([ - "cub/**", - ]), -) - -cc_library( - name = "cub", - hdrs = if_cuda([":cub_header_files"]), - include_prefix = "gpu", - deps = [ - "@local_config_cuda//cuda:cuda_headers", - ], -) diff --git a/build_deps/toolchains/gpu/cuda/BUILD.tpl b/build_deps/toolchains/gpu/cuda/BUILD.tpl deleted file mode 100644 index 1ac5643f..00000000 --- a/build_deps/toolchains/gpu/cuda/BUILD.tpl +++ /dev/null @@ -1,227 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - %{cuda_headers} - ], - include_prefix = "third_party/gpus", - includes = [ - ".", - "cuda/include", - "cuda/include/crt", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/%{cudart_static_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - %{cudart_static_linkopt} - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/%{cuda_driver_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/%{cudart_lib}"], - data = ["cuda/lib/%{cudart_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/%{cublas_lib}"], - data = ["cuda/lib/%{cublas_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/%{cusolver_lib}"], - data = ["cuda/lib/%{cusolver_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkopts = ["-lgomp"], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/%{cudnn_lib}"], - data = ["cuda/lib/%{cudnn_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - 
-cc_library( - name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/%{cufft_lib}"], - data = ["cuda/lib/%{cufft_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/%{curand_lib}"], - data = ["cuda/lib/%{curand_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda", - visibility = ["//visibility:public"], - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/%{cupti_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_libs", - data = [ - ":cudart", - ], - linkopts = select({ - ":darwin": [ - "-Wl,-rpath,./lib", - "-Wl,-rpath,./extras/CUPTI/lib", - ], - "//conditions:default": [ - "-Wl,-rpath,./lib64", - "-Wl,-rpath,./extras/CUPTI/lib64", - ], - }), - deps = [ - ":cudart", - ], -) - -%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl b/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl deleted file mode 100644 index 3ed4fd41..00000000 --- a/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl +++ /dev/null @@ -1,164 +0,0 @@ -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - %{cuda_headers} - ], - includes = [ - ".", - "cuda/include", - "cuda/include/crt", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudart_static", - # /WHOLEARCHIVE:cudart_static.lib will cause a - # "Internal error during CImplib::EmitThunk" error. - # Treat this library as interface library to avoid being whole archived when - # linking a DLL that depends on this. - # TODO(pcloudy): Remove this rule after b/111278841 is resolved. 
- interface_library = "cuda/lib/%{cudart_static_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cuda_driver", - interface_library = "cuda/lib/%{cuda_driver_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudart", - interface_library = "cuda/lib/%{cudart_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cublas", - interface_library = "cuda/lib/%{cublas_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cusolver", - interface_library = "cuda/lib/%{cusolver_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudnn", - interface_library = "cuda/lib/%{cudnn_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cufft", - interface_library = "cuda/lib/%{cufft_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "curand", - interface_library = "cuda/lib/%{curand_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda", - visibility = ["//visibility:public"], - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cupti_dsos", - interface_library = "cuda/lib/%{cupti_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], - visibility = ["//visibility:public"], -) - -%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl deleted file mode 100644 index a4f484fb..00000000 --- a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl +++ /dev/null @@ -1,62 +0,0 @@ -# Macros for building CUDA code. -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. - - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false - }) - - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + %{cuda_extra_copts}) - - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return %{cuda_is_configured} - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return x - return [] - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. 
- - Generates both a header-only target with virtual includes plus the full - target without virtual includes. This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) diff --git a/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl b/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl deleted file mode 100644 index 811b040e..00000000 --- a/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ - -#define TF_CUDA_CAPABILITIES %{cuda_compute_capabilities} - -#define TF_CUDA_VERSION "%{cuda_version}" -#define TF_CUDNN_VERSION "%{cudnn_version}" - -#define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" - -#endif // CUDA_CUDA_CONFIG_H_ diff --git a/build_deps/toolchains/gpu/cuda_configure.bzl b/build_deps/toolchains/gpu/cuda_configure.bzl deleted file mode 100644 index ba38c6b5..00000000 --- a/build_deps/toolchains/gpu/cuda_configure.bzl +++ /dev/null @@ -1,1116 +0,0 @@ -# -*- Python -*- -"""Repository rule for CUDA autoconfiguration. -`cuda_configure` depends on the following environment variables: - * `TF_NEED_CUDA`: Whether to enable building with CUDA. - * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path - * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. - * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for - both host and device code compilation if TF_CUDA_CLANG is 1. - * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is - `/usr/local/cuda,usr/`. - * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is - `/usr/local/cuda`. - * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then - use the system default. - * `TF_CUDNN_VERSION`: The version of the cuDNN library. - * `CUDNN_INSTALL_PATH`: The path to the cuDNN library. Default is - `/usr/local/cuda`. - * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is - `3.5,5.2`. 
- * `PYTHON_BIN_PATH`: The python binary path -""" - -load( - "@bazel_tools//tools/cpp:lib_cc_configure.bzl", - "escape_string", - "get_env_var", -) -load( - "@bazel_tools//tools/cpp:windows_cc_configure.bzl", - "find_msvc_tool", - "find_vc_path", - "setup_vc_env_vars", -) - -_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" - -_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" - -_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" - -_TF_CUDA_VERSION = "TF_CUDA_VERSION" - -_TF_CUDNN_VERSION = "TF_CUDNN_VERSION" - -_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH" - -_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" - -_TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" - -_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" - -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = [ - "3.5", - "5.2", -] - -def _get_python_bin(repository_ctx): - """Gets the python bin path.""" - python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH) - if python_bin != None: - return python_bin - python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python" - python_bin_path = repository_ctx.which(python_bin_name) - if python_bin_path != None: - return str(python_bin_path) - auto_configure_fail( - "Cannot find python in PATH, please make sure " + - "python is installed and add its directory in PATH, or --define " + - "%s='/something/else'.\nPATH=%s" % ( - _PYTHON_BIN_PATH, - repository_ctx.os.environ.get("PATH", ""), - ), - ) - -def _get_nvcc_tmp_dir_for_windows(repository_ctx): - """Return the tmp directory for nvcc to generate intermediate source files.""" - escaped_tmp_dir = escape_string( - get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( - "\\", - "\\\\", - ), - ) - return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir" - -def _get_msvc_compiler(repository_ctx): - vc_path = find_vc_path(repository_ctx) - return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/") - -def _get_win_cuda_defines(repository_ctx): - """Return CROSSTOOL defines for Windows""" - - # If we are not on Windows, return empty vaules for Windows specific fields. - # This ensures the CROSSTOOL file parser is happy. - if not _is_windows(repository_ctx): - return dict({ - "%{msvc_env_tmp}": "", - "%{msvc_env_path}": "", - "%{msvc_env_include}": "", - "%{msvc_env_lib}": "", - "%{msvc_cl_path}": "", - "%{msvc_ml_path}": "", - "%{msvc_link_path}": "", - "%{msvc_lib_path}": "", - "%{cxx_builtin_include_directory}": "", - }) - - vc_path = find_vc_path(repository_ctx) - if not vc_path: - auto_configure_fail( - "Visual C++ build tools not found on your machine. 
" + - "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using", - ) - return {} - - env = setup_vc_env_vars(repository_ctx, vc_path) - escaped_paths = escape_string(env["PATH"]) - escaped_include_paths = escape_string(env["INCLUDE"]) - escaped_lib_paths = escape_string(env["LIB"]) - escaped_tmp_dir = escape_string( - get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( - "\\", - "\\\\", - ), - ) - - msvc_cl_path = _get_python_bin(repository_ctx) - msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace( - "\\", - "/", - ) - msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace( - "\\", - "/", - ) - msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace( - "\\", - "/", - ) - - # nvcc will generate some temporary source files under %{nvcc_tmp_dir} - # The generated files are guranteed to have unique name, so they can share the same tmp directory - escaped_cxx_include_directories = [ - "cxx_builtin_include_directory: \"%s\"" % - _get_nvcc_tmp_dir_for_windows(repository_ctx), - ] - for path in escaped_include_paths.split(";"): - if path: - escaped_cxx_include_directories.append( - "cxx_builtin_include_directory: \"%s\"" % path, - ) - - return { - "%{msvc_env_tmp}": escaped_tmp_dir, - "%{msvc_env_path}": escaped_paths, - "%{msvc_env_include}": escaped_include_paths, - "%{msvc_env_lib}": escaped_lib_paths, - "%{msvc_cl_path}": msvc_cl_path, - "%{msvc_ml_path}": msvc_ml_path, - "%{msvc_link_path}": msvc_link_path, - "%{msvc_lib_path}": msvc_lib_path, - "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories), - } - -def find_cc(repository_ctx): - """Find the C++ compiler.""" - if _is_windows(repository_ctx): - return _get_msvc_compiler(repository_ctx) - - target_cc_name = "gcc" - cc_path_envvar = _GCC_HOST_COMPILER_PATH - cc_name = target_cc_name - - if cc_path_envvar in repository_ctx.os.environ: - cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() - if cc_name_from_env: - cc_name = cc_name_from_env - if cc_name.startswith("/"): - # Absolute path, maybe we should make this supported by our which function. - return cc_name - cc = repository_ctx.which(cc_name) - if cc == None: - fail(("Cannot find {}, either correct your path or set the {}" + - " environment variable").format(target_cc_name, cc_path_envvar)) - return cc - -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" - -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. - """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. 
- return path[len(crosstool_folder) + 1:] - return path - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"]) - index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = result.stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = result.stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = result.stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = result.stderr[index1 + 1:] - else: - inc_dirs = result.stderr[index1 + 1:index2].strip() - - return [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - -def get_cxx_inc_directories(repository_ctx, cc): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True) - includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - -def auto_configure_fail(msg): - """Output failure message when cuda configuration fails.""" - red = "\033[0;31m" - no_color = "\033[0m" - fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) - -# END cc_configure common functions (see TODO above). - -def _host_compiler_includes(repository_ctx, cc): - """Generates the cxx_builtin_include_directory entries for gcc inc dirs. - Args: - repository_ctx: The repository context. - cc: The path to the gcc host compiler. - Returns: - A string containing the cxx_builtin_include_directory for each of the gcc - host compiler include directories, which can be added to the CROSSTOOL - file. - """ - inc_dirs = get_cxx_inc_directories(repository_ctx, cc) - inc_entries = [] - for inc_dir in inc_dirs: - inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % inc_dir) - return "\n".join(inc_entries) - -def _cuda_include_path(repository_ctx, cuda_config): - """Generates the cxx_builtin_include_directory entries for cuda inc dirs. - Args: - repository_ctx: The repository context. - cc: The path to the gcc host compiler. - Returns: - A string containing the cxx_builtin_include_directory for each of the gcc - host compiler include directories, which can be added to the CROSSTOOL - file. 
- """ - nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( - cuda_config.cuda_toolkit_path, - ".exe" if cuda_config.cpu_value == "Windows" else "", - )) - result = repository_ctx.execute([ - nvcc_path, - "-v", - "/dev/null", - "-o", - "/dev/null", - ]) - target_dir = "" - for one_line in result.stderr.splitlines(): - if one_line.startswith("#$ _TARGET_DIR_="): - target_dir = ( - cuda_config.cuda_toolkit_path + "/" + one_line.replace( - "#$ _TARGET_DIR_=", - "", - ) + "/include" - ) - inc_entries = [] - if target_dir != "": - inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % target_dir) - default_include = cuda_config.cuda_toolkit_path + "/include" - inc_entries.append( - " cxx_builtin_include_directory: \"%s\"" % default_include, - ) - return "\n".join(inc_entries) - -def enable_cuda(repository_ctx): - if "TF_NEED_CUDA" in repository_ctx.os.environ: - enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip() - return enable_cuda == "1" - return False - -def matches_version(environ_version, detected_version): - """Checks whether the user-specified version matches the detected version. - This function performs a weak matching so that if the user specifies only - the - major or major and minor versions, the versions are still considered - matching - if the version parts match. To illustrate: - environ_version detected_version result - ----------------------------------------- - 5.1.3 5.1.3 True - 5.1 5.1.3 True - 5 5.1 True - 5.1.3 5.1 False - 5.2.3 5.1.3 False - Args: - environ_version: The version specified by the user via environment - variables. - detected_version: The version autodetected from the CUDA installation on - the system. - Returns: True if user-specified version matches detected version and False - otherwise. - """ - environ_version_parts = environ_version.split(".") - detected_version_parts = detected_version.split(".") - if len(detected_version_parts) < len(environ_version_parts): - return False - for i, part in enumerate(detected_version_parts): - if i >= len(environ_version_parts): - break - if part != environ_version_parts[i]: - return False - return True - -def find_cuda_define(repository_ctx, header_dir, header_file, define): - """Returns the value of a #define in a header file. - Greps through a header file and returns the value of the specified #define. - If the #define is not found, then raise an error. - Args: - repository_ctx: The repository context. - header_dir: The directory containing the header file. - header_file: The header file name. - define: The #define to search for. - Returns: - The value of the #define found in the header. - """ - - # Confirm location of the header and grep for the line defining the macro. - h_path = repository_ctx.path("%s/%s" % (header_dir, header_file)) - if not h_path.exists: - auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path))) - result = repository_ctx.execute( - # Grep one more lines as some #defines are split into two lines. - [ - "grep", - "--color=never", - "-A1", - "-E", - define, - str(h_path), - ], - ) - if result.stderr: - auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr)) - - # Parse the version from the line defining the macro. 
- if result.stdout.find(define) == -1: - auto_configure_fail( - "Cannot find line containing '%s' in %s" % (define, h_path), - ) - - # Split results to lines - lines = result.stdout.split("\n") - num_lines = len(lines) - for l in range(num_lines): - line = lines[l] - if define in line: # Find the line with define - version = line - if l != num_lines - 1 and line[-1] == "\\": # Add next line, if multiline - version = version[:-1] + lines[l + 1] - break - - # Remove any comments - version = version.split("//")[0] - - # Remove define name - version = version.replace(define, "").strip() - - # Remove the code after the version number. - version_end = version.find(" ") - if version_end != -1: - if version_end == 0: - auto_configure_fail( - "Cannot extract the version from line containing '%s' in %s" % - (define, str(h_path)), - ) - version = version[:version_end].strip() - return version - -def compute_capabilities(repository_ctx): - """Returns a list of strings representing cuda compute capabilities.""" - if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ: - return _DEFAULT_CUDA_COMPUTE_CAPABILITIES - capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES] - capabilities = capabilities_str.split(",") - for capability in capabilities: - # Workaround for Skylark's lack of support for regex. This check should - # be equivalent to checking: - # if re.match("[0-9]+.[0-9]+", capability) == None: - parts = capability.split(".") - if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit(): - auto_configure_fail("Invalid compute capability: %s" % capability) - return capabilities - -def get_cpu_value(repository_ctx): - """Returns the name of the host operating system. - Args: - repository_ctx: The repository context. - Returns: - A string containing the name of the host operating system. - """ - os_name = repository_ctx.os.name.lower() - if os_name.startswith("mac os"): - return "Darwin" - if os_name.find("windows") != -1: - return "Windows" - result = repository_ctx.execute(["uname", "-s"]) - return result.stdout.strip() - -def _is_windows(repository_ctx): - """Returns true if the host operating system is windows.""" - return get_cpu_value(repository_ctx) == "Windows" - -def lib_name(base_name, cpu_value, version = None, static = False): - """Constructs the platform-specific name of a library. - Args: - base_name: The name of the library, such as "cudart" - cpu_value: The name of the host operating system. - version: The version of the library. - static: True the library is static or False if it is a shared object. - Returns: - The platform-specific name of the library. - """ - version = "" if not version else "." + version - if cpu_value in ("Linux", "FreeBSD"): - if static: - return "lib%s.a" % base_name - return "lib%s.so%s" % (base_name, version) - elif cpu_value == "Windows": - return "%s.lib" % base_name - elif cpu_value == "Darwin": - if static: - return "lib%s.a" % base_name - return "lib%s%s.dylib" % (base_name, version) - else: - auto_configure_fail("Invalid cpu_value: %s" % cpu_value) - -def find_lib(repository_ctx, paths, check_soname = True): - """ - Finds a library among a list of potential paths. - Args: - paths: List of paths to inspect. - Returns: - Returns the first path in paths that exist. 
- """ - objdump = repository_ctx.which("objdump") - mismatches = [] - for path in [repository_ctx.path(path) for path in paths]: - if not path.exists: - continue - if check_soname and objdump != None and not _is_windows(repository_ctx): - output = repository_ctx.execute([objdump, "-p", str(path)]).stdout - output = [line for line in output.splitlines() if "SONAME" in line] - sonames = [line.strip().split(" ")[-1] for line in output] - if not any([soname == path.basename for soname in sonames]): - mismatches.append(str(path)) - continue - return path - if mismatches: - auto_configure_fail( - "None of the libraries match their SONAME: " + ", ".join(mismatches), - ) - auto_configure_fail("No library found under: " + ", ".join(paths)) - -def _find_cuda_lib( - lib, - repository_ctx, - cpu_value, - basedir, - version, - static = False): - """Finds the given CUDA or cuDNN library on the system. - Args: - lib: The name of the library, such as "cudart" - repository_ctx: The repository context. - cpu_value: The name of the host operating system. - basedir: The install directory of CUDA or cuDNN. - version: The version of the library. - static: True if static library, False if shared object. - Returns: - Returns the path to the library. - """ - file_name = lib_name(lib, cpu_value, version, static) - - return find_lib( - repository_ctx, - ["%s/%s" % (basedir, file_name)], - check_soname = version and not static, - ) - -def _find_libs(repository_ctx, cuda_config): - """Returns the CUDA and cuDNN libraries on the system. - Args: - repository_ctx: The repository context. - cuda_config: The CUDA config as returned by _get_cuda_config - Returns: - Map of library names to structs of filename and path. - """ - cpu_value = cuda_config.cpu_value - stub_dir = "" if _is_windows(repository_ctx) else "/stubs" - return { - "cuda": _find_cuda_lib( - "cuda", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"] + stub_dir, - None, - ), - "cudart": _find_cuda_lib( - "cudart", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cudart_version, - ), - "cudart_static": _find_cuda_lib( - "cudart_static", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_version, - static = True, - ), - "cublas": _find_cuda_lib( - "cublas", - repository_ctx, - cpu_value, - cuda_config.config["cublas_library_dir"], - cuda_config.cublas_version, - ), - "cusolver": _find_cuda_lib( - "cusolver", - repository_ctx, - cpu_value, - cuda_config.config["cusolver_library_dir"], - cuda_config.cusolver_version, - ), - "curand": _find_cuda_lib( - "curand", - repository_ctx, - cpu_value, - cuda_config.config["curand_library_dir"], - cuda_config.curand_version, - ), - "cufft": _find_cuda_lib( - "cufft", - repository_ctx, - cpu_value, - cuda_config.config["cufft_library_dir"], - cuda_config.cufft_version, - ), - "cudnn": _find_cuda_lib( - "cudnn", - repository_ctx, - cpu_value, - cuda_config.config["cudnn_library_dir"], - cuda_config.cudnn_version, - ), - "cupti": _find_cuda_lib( - "cupti", - repository_ctx, - cpu_value, - cuda_config.config["cupti_library_dir"], - cuda_config.cuda_version, - ), - } - -def _cudart_static_linkopt(cpu_value): - """Returns additional platform-specific linkopts for cudart.""" - return "" if cpu_value == "Darwin" else "\"-lrt\"," - -def _get_cuda_config(repository_ctx): - """Detects and returns information about the CUDA installation on the system. - Args: - repository_ctx: The repository context. 
- Returns: - A struct containing the following fields: - cuda_toolkit_path: The CUDA toolkit installation directory. - cudnn_install_basedir: The cuDNN installation directory. - cuda_version: The version of CUDA on the system. - cudart_version: The CUDA runtime version on the system. - cudnn_version: The version of cuDNN on the system. - compute_capabilities: A list of the system's CUDA compute capabilities. - cpu_value: The name of the host operating system. - """ - config = find_cuda_config(repository_ctx, ["cuda", "cudnn"]) - cpu_value = get_cpu_value(repository_ctx) - toolkit_path = config["cuda_toolkit_path"] - - is_windows = _is_windows(repository_ctx) - cuda_version = config["cuda_version"].split(".") - cuda_major = cuda_version[0] - cuda_minor = cuda_version[1] - - cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor) - cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"] - - if int(cuda_major) >= 11: - # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. - if int(cuda_major) == 11: - cudart_version = "64_110" if is_windows else "11.0" - else: - cudart_version = ("64_%s" if is_windows else "%s") % cuda_major - cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0] - cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0] - curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0] - cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0] - elif (int(cuda_major), int(cuda_minor)) >= (10, 1): - # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. - # It changed from 'x.y' to just 'x' in CUDA 10.1. - cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major - cublas_version = cuda_lib_version - cusolver_version = cuda_lib_version - curand_version = cuda_lib_version - cufft_version = cuda_lib_version - cudart_version = cuda_version - else: - cublas_version = cuda_version - cusolver_version = cuda_version - curand_version = cuda_version - cufft_version = cuda_version - cudart_version = cuda_version - - return struct( - cuda_toolkit_path = toolkit_path, - cuda_version = cuda_version, - cudart_version = cudart_version, - cublas_version = cublas_version, - cusolver_version = cusolver_version, - curand_version = curand_version, - cufft_version = cufft_version, - cudnn_version = cudnn_version, - compute_capabilities = compute_capabilities(repository_ctx), - cpu_value = cpu_value, - config = config, - ) - -def _tpl(repository_ctx, tpl, substitutions = {}, out = None): - if substitutions == None: - substitutions = {} - if not out: - out = tpl.replace(":", "/") - repository_ctx.template( - out, - Label("//build_deps/toolchains/gpu/%s.tpl" % tpl), - substitutions, - ) - -_DUMMY_CROSSTOOL_BUILD_FILE = """ -load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") -error_gpu_disabled() -""" - -def _create_dummy_repository(repository_ctx): - cpu_value = get_cpu_value(repository_ctx) - - # Set up BUILD file for cuda/. 
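    # With CUDA disabled, the templates are instantiated with cuda_is_configured
    # set to "False" and with empty copy rules, so targets that reference
    # @local_config_cuda still resolve even though no real toolkit is present.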
- _tpl( - repository_ctx, - "cuda:build_defs.bzl", - { - "%{cuda_is_configured}": "False", - "%{cuda_extra_copts}": "[]", - }, - ) - _tpl( - repository_ctx, - "cuda:BUILD", - { - "%{cuda_driver_lib}": lib_name("cuda", cpu_value), - "%{cudart_static_lib}": lib_name( - "cudart_static", - cpu_value, - static = True, - ), - "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), - "%{cudart_lib}": lib_name("cudart", cpu_value), - "%{cublas_lib}": lib_name("cublas", cpu_value), - "%{cusolver_lib}": lib_name("cusolver", cpu_value), - "%{cudnn_lib}": lib_name("cudnn", cpu_value), - "%{cufft_lib}": lib_name("cufft", cpu_value), - "%{curand_lib}": lib_name("curand", cpu_value), - "%{cupti_lib}": lib_name("cupti", cpu_value), - "%{copy_rules}": "", - "%{cuda_headers}": "", - }, - ) - - # Create dummy files for the CUDA toolkit since they are still required by - # tensorflow/core/platform/default/build_config:cuda. - repository_ctx.file("cuda/cuda/include/cuda.h") - repository_ctx.file("cuda/cuda/include/cublas.h") - repository_ctx.file("cuda/cuda/include/cudnn.h") - repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value)) - repository_ctx.file( - "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), - ) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value)) - -def _execute( - repository_ctx, - cmdline, - error_msg = None, - error_details = None, - empty_stdout_fine = False): - """Executes an arbitrary shell command. - Args: - repository_ctx: the repository_ctx object - cmdline: list of strings, the command to execute - error_msg: string, a summary of the error if the command fails - error_details: string, details about the error or steps to fix it - empty_stdout_fine: bool, if True, an empty stdout result is fine, - otherwise it's an error - Return: the result of repository_ctx.execute(cmdline) - """ - result = repository_ctx.execute(cmdline) - if result.stderr or not (empty_stdout_fine or result.stdout): - auto_configure_fail( - "\n".join([ - error_msg.strip() if error_msg else "Repository command failed", - result.stderr.strip(), - error_details if error_details else "", - ]), - ) - return result - -def _norm_path(path): - """Returns a path with '/' and remove the trailing slash.""" - path = path.replace("\\", "/") - if path[-1] == "/": - path = path[:-1] - return path - -def make_copy_files_rule(repository_ctx, name, srcs, outs): - """Returns a rule to copy a set of files.""" - cmds = [] - - # Copy files. 
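    # The returned string is a genrule of roughly this shape (paths illustrative):
    #   genrule(
    #       name = "cublas-include",
    #       outs = ["cublas/include/cublas.h", ...],
    #       cmd = """cp -f "/usr/local/cuda/include/cublas.h" $(location cublas/include/cublas.h) && ...""",
    #   )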
- for src, out in zip(srcs, outs): - cmds.append('cp -f "%s" $(location %s)' % (src, out)) - outs = [(' "%s",' % out) for out in outs] - return """genrule( - name = "%s", - outs = [ -%s - ], - cmd = \"""%s \""", -)""" % (name, "\n".join(outs), " && ".join(cmds)) - -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): - """Returns a rule to recursively copy a directory.""" - src_dir = _norm_path(src_dir) - out_dir = _norm_path(out_dir) - outs = _read_dir(repository_ctx, src_dir) - outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] - - # '@D' already contains the relative path for a single file, see - # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables - out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" - return """genrule( - name = "%s", - outs = [ -%s - ], - cmd = \"""cp -rLf "%s/." "%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) - -def _read_dir(repository_ctx, src_dir): - """Returns a string with all files in a directory. - Finds all files inside a directory, traversing subfolders and following - symlinks. The returned string contains the full path of all files - separated by line breaks. - """ - if _is_windows(repository_ctx): - src_dir = src_dir.replace("/", "\\") - find_result = _execute( - repository_ctx, - ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], - empty_stdout_fine = True, - ) - - # src_files will be used in genrule.outs where the paths must - # use forward slashes. - result = find_result.stdout.replace("\\", "/") - else: - find_result = _execute( - repository_ctx, - ["find", src_dir, "-follow", "-type", "f"], - empty_stdout_fine = True, - ) - result = find_result.stdout - return sorted(result.splitlines()) - -def _create_local_cuda_repository(repository_ctx): - """Creates the repository containing files set up to build with CUDA.""" - cuda_config = _get_cuda_config(repository_ctx) - - cuda_include_path = cuda_config.config["cuda_include_dir"] - cublas_include_path = cuda_config.config["cublas_include_dir"] - cudnn_header_dir = cuda_config.config["cudnn_include_dir"] - cupti_header_dir = cuda_config.config["cupti_include_dir"] - nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"] - - # Create genrule to copy files from the installed CUDA toolkit into execroot. 
- copy_rules = [ - make_copy_dir_rule( - repository_ctx, - name = "cuda-include", - src_dir = cuda_include_path, - out_dir = "cuda/include", - ), - make_copy_dir_rule( - repository_ctx, - name = "cuda-nvvm", - src_dir = nvvm_libdevice_dir, - out_dir = "cuda/nvvm/libdevice", - ), - make_copy_dir_rule( - repository_ctx, - name = "cuda-extras", - src_dir = cupti_header_dir, - out_dir = "cuda/extras/CUPTI/include", - ), - ] - - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cublas-include", - srcs = [ - cublas_include_path + "/cublas.h", - cublas_include_path + "/cublas_v2.h", - cublas_include_path + "/cublas_api.h", - ], - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - )) - - cuda_libs = _find_libs(repository_ctx, cuda_config) - cuda_lib_srcs = [] - cuda_lib_outs = [] - for path in cuda_libs.values(): - cuda_lib_srcs.append(str(path)) - cuda_lib_outs.append("cuda/lib/" + path.basename) - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cuda-lib", - srcs = cuda_lib_srcs, - outs = cuda_lib_outs, - )) - - copy_rules.append(make_copy_dir_rule( - repository_ctx, - name = "cuda-bin", - src_dir = cuda_config.cuda_toolkit_path + "/bin", - out_dir = "cuda/bin", - )) - - # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH. - included_files = _read_dir(repository_ctx, cuda_include_path) - if not any([file.endswith("cudnn.h") for file in included_files]): - if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]: - cudnn_headers = ["cudnn.h"] - else: - cudnn_headers = [ - "cudnn_adv_infer.h", - "cudnn_adv_train.h", - "cudnn_cnn_infer.h", - "cudnn_cnn_train.h", - "cudnn_ops_infer.h", - "cudnn_ops_train.h", - "cudnn.h", - "cudnn_version.h", - ] - cudnn_srcs = [] - cudnn_outs = [] - for header in cudnn_headers: - cudnn_srcs.append(cudnn_header_dir + "/" + header) - cudnn_outs.append("cudnn/include/" + header) - - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cudnn-include", - srcs = cudnn_srcs, - outs = cudnn_outs, - )) - else: - copy_rules.append("filegroup(name = 'cudnn-include')\n") - - # Set up BUILD file for cuda/ - _tpl( - repository_ctx, - "cuda:build_defs.bzl", - { - "%{cuda_is_configured}": "True", - "%{cuda_extra_copts}": "[]", - }, - ) - - _tpl( - repository_ctx, - "cuda:BUILD", - { - "%{cuda_driver_lib}": cuda_libs["cuda"].basename, - "%{cudart_static_lib}": cuda_libs["cudart_static"].basename, - "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), - "%{cudart_lib}": cuda_libs["cudart"].basename, - "%{cublas_lib}": cuda_libs["cublas"].basename, - "%{cusolver_lib}": cuda_libs["cusolver"].basename, - "%{cudnn_lib}": cuda_libs["cudnn"].basename, - "%{cufft_lib}": cuda_libs["cufft"].basename, - "%{curand_lib}": cuda_libs["curand"].basename, - "%{cupti_lib}": cuda_libs["cupti"].basename, - "%{copy_rules}": "\n".join(copy_rules), - "%{cuda_headers}": ( - '":cuda-include",\n' + ' ":cudnn-include",' - ), - }, - "cuda/BUILD", - ) - - # Set up crosstool/ - cc = find_cc(repository_ctx) - cc_fullpath = cc - - host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath) - - cuda_defines = {} - - # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see - # https://github.com/bazelbuild/bazel/issues/760). - # However, this stops our custom clang toolchain from picking the provided - # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded - # toolchain. 
- # TODO: when bazel stops adding '-B/usr/bin' by default, remove this - # flag from the CROSSTOOL completely (see - # https://github.com/bazelbuild/bazel/issues/5634) - cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"' - - cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" - cuda_defines["%{host_compiler_warnings}"] = "" - - # nvcc has the system include paths built in and will automatically - # search them; we cannot work around that, so we add the relevant cuda - # system paths to the allowed compiler specific include paths. - cuda_defines["%{host_compiler_includes}"] = ( - host_compiler_includes + "\n" + _cuda_include_path( - repository_ctx, - cuda_config, - ) + - "\n cxx_builtin_include_directory: \"%s\"" % cupti_header_dir + - "\n cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir - ) - - # For gcc, do not canonicalize system header paths; some versions of gcc - # pick the shortest possible path for system includes when creating the - # .d file - given that includes that are prefixed with "../" multiple - # time quickly grow longer than the root of the tree, this can lead to - # bazel's header check failing. - cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ( - "flag: \"-fno-canonical-system-headers\"" - ) - nvcc_path = str( - repository_ctx.path("%s/bin/nvcc%s" % ( - cuda_config.cuda_toolkit_path, - ".exe" if _is_windows(repository_ctx) else "", - )), - ) - - builtin_include_directories = [] - for one_line in cuda_defines["%{host_compiler_includes}"].splitlines(): - inc_dir = one_line.split(":")[1][2:-1] - builtin_include_directories.append(inc_dir) - - _tpl( - repository_ctx, - "crosstool:BUILD", - { - "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc", - "%{cxx_builtin_include_directories}": ",".join(builtin_include_directories), - "%{win_linker_files}": ":windows_msvc_wrapper_files", - }, - ) - wrapper_defines = { - "%{cpu_compiler}": str(cc), - "%{cuda_version}": cuda_config.cuda_version, - "%{nvcc_path}": nvcc_path, - "%{gcc_host_compiler_path}": str(cc), - "%{cuda_compute_capabilities}": ", ".join( - ["\"%s\"" % c for c in cuda_config.compute_capabilities], - ), - "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), - } - - _tpl( - repository_ctx, - "crosstool:cc_toolchain_config.bzl", - wrapper_defines, - ) - _tpl( - repository_ctx, - "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", - wrapper_defines, - ) - - _tpl( - repository_ctx, - "crosstool:windows/msvc_wrapper_for_nvcc.py", - wrapper_defines, - ) - - _tpl( - repository_ctx, - "crosstool:CROSSTOOL", - cuda_defines.update(_get_win_cuda_defines(repository_ctx)), - out = "crosstool/CROSSTOOL", - ) - -def find_cuda_config(repository_ctx, cuda_libraries): - """Returns CUDA config dictionary from running find_cuda_config.py""" - exec_result = repository_ctx.execute([ - _get_python_bin(repository_ctx), - repository_ctx.path(Label("//build_deps/toolchains/gpu:find_cuda_config.py")), - ] + cuda_libraries) - if exec_result.return_code: - auto_configure_fail("Failed to run find_cuda_config.py: %s" % exec_result.stderr) - - # Parse the dict from stdout. 
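    # find_cuda_config.py emits one "key: value" pair per line, e.g.
    # "cuda_version: 11.2" or "cuda_include_dir: /usr/local/cuda/include";
    # each line is split on ": " below and the pairs are collected into a dict.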
- return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()]) - -def _cuda_autoconf_impl(repository_ctx): - """Implementation of the cuda_autoconf repository rule.""" - if not enable_cuda(repository_ctx): - _create_dummy_repository(repository_ctx) - else: - _create_local_cuda_repository(repository_ctx) - -cuda_configure = repository_rule( - environ = [ - _GCC_HOST_COMPILER_PATH, - _CLANG_CUDA_COMPILER_PATH, - "TF_NEED_CUDA", - "TF_CUDA_CLANG", - _TF_DOWNLOAD_CLANG, - _CUDA_TOOLKIT_PATH, - _CUDNN_INSTALL_PATH, - _TF_CUDA_VERSION, - _TF_CUDNN_VERSION, - _TF_CUDA_COMPUTE_CAPABILITIES, - "NVVMIR_LIBRARY_DIR", - _PYTHON_BIN_PATH, - ], - implementation = _cuda_autoconf_impl, -) - -"""Detects and configures the local CUDA toolchain. -Add the following to your WORKSPACE FILE: -```python -cuda_configure(name = "local_config_cuda") -``` -Args: - name: A unique name for this workspace rule. -""" diff --git a/build_deps/toolchains/gpu/find_cuda_config.py b/build_deps/toolchains/gpu/find_cuda_config.py deleted file mode 100644 index 679de5ea..00000000 --- a/build_deps/toolchains/gpu/find_cuda_config.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Prints CUDA library and header directories and versions found on the system. - -The script searches for CUDA library and header files on the system, inspects -them to determine their version and prints the configuration to stdout. -The paths to inspect and the required versions are specified through environment -variables. If no valid configuration is found, the script prints to stderr and -returns an error code. - -The list of libraries to find is specified as arguments. Supported libraries are -CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT. - -The script takes a list of base directories specified by the TF_CUDA_PATHS -environment variable as comma-separated glob list. The script looks for headers -and library files in a hard-coded set of subdirectories from these base paths. -If TF_CUDA_PATHS is not specified, a OS specific default is used: - - Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. - Windows: CUDA_PATH environment variable, or - C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* - -For backwards compatibility, some libraries also use alternative base -directories from other environment variables if they are specified. 
List of -library-specific environment variables: - - Library Version env variable Additional base directories - ---------------------------------------------------------------- - CUDA TF_CUDA_VERSION CUDA_TOOLKIT_PATH - cuBLAS TF_CUBLAS_VERSION CUDA_TOOLKIT_PATH - cuDNN TF_CUDNN_VERSION CUDNN_INSTALL_PATH - NCCL TF_NCCL_VERSION NCCL_INSTALL_PATH, NCCL_HDR_PATH - TensorRT TF_TENSORRT_VERSION TENSORRT_INSTALL_PATH - -Versions environment variables can be of the form 'x' or 'x.y' to request a -specific version, empty or unspecified to accept any version. - -The output of a found library is of the form: -tf__version: x.y.z -tf__header_dir: ... -tf__library_dir: ... -""" - -import io -import os -import glob -import platform -import re -import subprocess -import sys - -# pylint: disable=g-import-not-at-top -try: - from shutil import which -except ImportError: - from distutils.spawn import find_executable as which -# pylint: enable=g-import-not-at-top - - -class ConfigError(Exception): - pass - - -def _is_linux(): - return platform.system() == "Linux" - - -def _is_windows(): - return platform.system() == "Windows" - - -def _is_macos(): - return platform.system() == "Darwin" - - -def _matches_version(actual_version, required_version): - """Checks whether some version meets the requirements. - - All elements of the required_version need to be present in the - actual_version. - - required_version actual_version result - ----------------------------------------- - 1 1.1 True - 1.2 1 False - 1.2 1.3 False - 1 True - - Args: - required_version: The version specified by the user. - actual_version: The version detected from the CUDA installation. - Returns: Whether the actual version matches the required one. - """ - if actual_version is None: - return False - - # Strip spaces from the versions. - actual_version = actual_version.strip() - required_version = required_version.strip() - return actual_version.startswith(required_version) - - -def _at_least_version(actual_version, required_version): - actual = [int(v) for v in actual_version.split(".")] - required = [int(v) for v in required_version.split(".")] - return actual >= required - - -def _get_header_version(path, name): - """Returns preprocessor defines in C header file.""" - for line in io.open(path, "r", encoding="utf-8").readlines(): - match = re.match(r"#define %s +(\d+)" % name, line) - if match: - return match.group(1) - return "" - - -def _cartesian_product(first, second): - """Returns all path combinations of first and second.""" - return [os.path.join(f, s) for f in first for s in second] - - -def _get_ld_config_paths(): - """Returns all directories from 'ldconfig -p'.""" - if not _is_linux(): - return [] - ldconfig_path = which("ldconfig") or "/sbin/ldconfig" - output = subprocess.check_output([ldconfig_path, "-p"]) - pattern = re.compile(".* => (.*)") - result = set() - for line in output.splitlines(): - try: - match = pattern.match(line.decode("ascii")) - except UnicodeDecodeError: - match = False - if match: - result.add(os.path.dirname(match.group(1))) - return sorted(list(result)) - - -def _get_default_cuda_paths(cuda_version): - if not cuda_version: - cuda_version = "*" - elif "." 
not in cuda_version: - cuda_version = cuda_version + ".*" - - if _is_windows(): - return [ - os.environ.get( - "CUDA_PATH", - "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v%s\\" % cuda_version, - ) - ] - return [ - "/usr/local/cuda-%s" % cuda_version, - "/usr/local/cuda", - "/usr", - "/usr/local/cudnn", - ] + _get_ld_config_paths() - - -def _header_paths(): - """Returns hard-coded set of relative paths to look for header files.""" - return [ - "", - "include", - "include/cuda", - "include/*-linux-gnu", - "extras/CUPTI/include", - "include/cuda/CUPTI", - "local/cuda/extras/CUPTI/include", - ] - - -def _library_paths(): - """Returns hard-coded set of relative paths to look for library files.""" - return [ - "", - "lib64", - "lib", - "lib/*-linux-gnu", - "lib/x64", - "extras/CUPTI/*", - "local/cuda/lib64", - "local/cuda/extras/CUPTI/lib64", - ] - - -def _not_found_error(base_paths, relative_paths, filepattern): - base_paths = "".join(["\n '%s'" % path for path in sorted(base_paths)]) - relative_paths = "".join(["\n '%s'" % path for path in relative_paths]) - return ConfigError( - "Could not find any %s in any subdirectory:%s\nof:%s\n" % (filepattern, relative_paths, base_paths) - ) - - -def _find_file(base_paths, relative_paths, filepattern): - for path in _cartesian_product(base_paths, relative_paths): - for file in glob.glob(os.path.join(path, filepattern)): - return file - raise _not_found_error(base_paths, relative_paths, filepattern) - - -def _find_library(base_paths, library_name, required_version): - """Returns first valid path to the requested library.""" - if _is_windows(): - filepattern = library_name + ".lib" - elif _is_macos(): - filepattern = "%s*.dylib" % (".".join(["lib" + library_name] + required_version.split(".")[:1])) - else: - filepattern = (".".join(["lib" + library_name, "so"] + required_version.split(".")[:1]) + "*") - return _find_file(base_paths, _library_paths(), filepattern) - - -def _find_versioned_file(base_paths, relative_paths, filepatterns, required_version, get_version): - """Returns first valid path to a file that matches the requested version.""" - if type(filepatterns) not in [list, tuple]: - filepatterns = [filepatterns] - for path in _cartesian_product(base_paths, relative_paths): - for filepattern in filepatterns: - for file in glob.glob(os.path.join(path, filepattern)): - actual_version = get_version(file) - if _matches_version(actual_version, required_version): - return file, actual_version - raise _not_found_error( - base_paths, - relative_paths, - ", ".join(filepatterns) + " matching version '%s'" % required_version, - ) - - -def _find_header(base_paths, header_name, required_version, get_version): - """Returns first valid path to a header that matches the requested version.""" - return _find_versioned_file(base_paths, _header_paths(), header_name, required_version, get_version) - - -def _find_cuda_config(base_paths, required_version): - - def get_header_version(path): - version = int(_get_header_version(path, "CUDA_VERSION")) - if not version: - return None - return "%d.%d" % (version // 1000, version % 1000 // 10) - - cuda_header_path, header_version = _find_header(base_paths, "cuda.h", required_version, get_header_version) - cuda_version = header_version # x.y, see above. 
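  # For example, a CUDA 11.2 toolkit defines CUDA_VERSION as 11020 in cuda.h,
  # which get_header_version above renders as the string "11.2".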
- - cuda_library_path = _find_library(base_paths, "cudart", cuda_version) - - def get_nvcc_version(path): - pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)" - for line in subprocess.check_output([path, "--version"]).splitlines(): - match = re.match(pattern, line.decode("ascii")) - if match: - return match.group(1) - return None - - nvcc_name = "nvcc.exe" if _is_windows() else "nvcc" - nvcc_path, nvcc_version = _find_versioned_file( - base_paths, - [ - "", - "bin", - "local/cuda/bin", - ], - nvcc_name, - cuda_version, - get_nvcc_version, - ) - - nvvm_path = _find_file( - base_paths, - [ - "nvvm/libdevice", - "share/cuda", - "lib/nvidia-cuda-toolkit/libdevice", - "local/cuda/nvvm/libdevice", - ], - "libdevice*.10.bc", - ) - - cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h") - cupti_library_path = _find_library(base_paths, "cupti", required_version) - - cuda_binary_dir = os.path.dirname(nvcc_path) - nvvm_library_dir = os.path.dirname(nvvm_path) - - # XLA requires the toolkit path to find ptxas and libdevice. - # TODO(csigg): pass in both directories instead. - cuda_toolkit_paths = ( - os.path.normpath(os.path.join(cuda_binary_dir, "..")), - os.path.normpath(os.path.join(nvvm_library_dir, "../..")), - ) - if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]: - raise ConfigError("Inconsistent CUDA toolkit path: %s vs %s" % cuda_toolkit_paths) - - return { - "cuda_version": cuda_version, - "cuda_include_dir": os.path.dirname(cuda_header_path), - "cuda_library_dir": os.path.dirname(cuda_library_path), - "cuda_binary_dir": cuda_binary_dir, - "nvvm_library_dir": nvvm_library_dir, - "cupti_include_dir": os.path.dirname(cupti_header_path), - "cupti_library_dir": os.path.dirname(cupti_library_path), - "cuda_toolkit_path": cuda_toolkit_paths[0], - } - - -def _find_cublas_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "10.1"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("CUBLAS_VER_MAJOR", "CUBLAS_VER_MINOR", "CUBLAS_VER_PATCH") - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cublas_api.h", required_version, get_header_version) - # cuBLAS uses the major version only. - cublas_version = header_version.split(".")[0] - - else: - # There is no version info available before CUDA 10.1, just find the file. - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cublas_api.h") - # cuBLAS version is the same as CUDA version (x.y). 
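Hedged illustration, not part of the patch: the get_nvcc_version() helper above scans the `nvcc --version` banner line by line with a regular expression. The sample banner text below is an assumed example of that output, not captured from a real toolkit:

import re

sample_line = "Cuda compilation tools, release 11.8, V11.8.89"
pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)"
match = re.match(pattern, sample_line)
assert match is not None and match.group(1) == "11.8.89"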
- cublas_version = required_version - - library_path = _find_library(base_paths, "cublas", cublas_version) - - return { - "cublas_version": header_version, - "cublas_include_dir": os.path.dirname(header_path), - "cublas_library_dir": os.path.dirname(library_path), - } - - -def _find_cusolver_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ( - "CUSOLVER_VER_MAJOR", - "CUSOLVER_VER_MINOR", - "CUSOLVER_VER_PATCH", - ) - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cusolver_common.h", required_version, get_header_version) - cusolver_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cusolver_common.h") - cusolver_version = required_version - - library_path = _find_library(base_paths, "cusolver", cusolver_version) - - return { - "cusolver_version": header_version, - "cusolver_include_dir": os.path.dirname(header_path), - "cusolver_library_dir": os.path.dirname(library_path), - } - - -def _find_curand_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("CURAND_VER_MAJOR", "CURAND_VER_MINOR", "CURAND_VER_PATCH") - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "curand.h", required_version, get_header_version) - curand_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "curand.h") - curand_version = required_version - - library_path = _find_library(base_paths, "curand", curand_version) - - return { - "curand_version": header_version, - "curand_include_dir": os.path.dirname(header_path), - "curand_library_dir": os.path.dirname(library_path), - } - - -def _find_cufft_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = (_get_header_version(path, name) for name in ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR", "CUFFT_VER_PATCH")) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cufft.h", required_version, get_header_version) - cufft_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cufft.h") - cufft_version = required_version - - library_path = _find_library(base_paths, "cufft", cufft_version) - - return { - "cufft_version": header_version, - "cufft_include_dir": os.path.dirname(header_path), - "cufft_library_dir": os.path.dirname(library_path), - } - - -def _find_cudnn_config(base_paths, required_version): - - def get_header_version(path): - version = [_get_header_version(path, name) for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")] - return ".".join(version) if version[0] else None - - header_path, header_version = _find_header( - base_paths, ("cudnn.h", "cudnn_version.h"), required_version, get_header_version - ) - cudnn_version = header_version.split(".")[0] - - library_path = _find_library(base_paths, "cudnn", cudnn_version) - - return { - "cudnn_version": cudnn_version, - "cudnn_include_dir": os.path.dirname(header_path), - "cudnn_library_dir": os.path.dirname(library_path), - } - - -def _find_cusparse_config(base_paths, required_version, 
cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ( - "CUSPARSE_VER_MAJOR", - "CUSPARSE_VER_MINOR", - "CUSPARSE_VER_PATCH", - ) - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cusparse.h", required_version, get_header_version) - cusparse_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cusparse.h") - cusparse_version = required_version - - library_path = _find_library(base_paths, "cusparse", cusparse_version) - - return { - "cusparse_version": header_version, - "cusparse_include_dir": os.path.dirname(header_path), - "cusparse_library_dir": os.path.dirname(library_path), - } - - -def _find_nccl_config(base_paths, required_version): - - def get_header_version(path): - version = (_get_header_version(path, name) for name in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH")) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "nccl.h", required_version, get_header_version) - nccl_version = header_version.split(".")[0] - - library_path = _find_library(base_paths, "nccl", nccl_version) - - return { - "nccl_version": nccl_version, - "nccl_include_dir": os.path.dirname(header_path), - "nccl_library_dir": os.path.dirname(library_path), - } - - -def _find_tensorrt_config(base_paths, required_version): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR", "NV_TENSORRT_PATCH") - ) - # `version` is a generator object, so we convert it to a list before using - # it (muitiple times below). - version = list(version) - if not all(version): - return None # Versions not found, make _matches_version returns False. - return ".".join(version) - - try: - header_path, header_version = _find_header(base_paths, "NvInfer.h", required_version, get_header_version) - except ConfigError: - # TensorRT 6 moved the version information to NvInferVersion.h. - header_path, header_version = _find_header(base_paths, "NvInferVersion.h", required_version, get_header_version) - - tensorrt_version = header_version.split(".")[0] - library_path = _find_library(base_paths, "nvinfer", tensorrt_version) - - return { - "tensorrt_version": tensorrt_version, - "tensorrt_include_dir": os.path.dirname(header_path), - "tensorrt_library_dir": os.path.dirname(library_path), - } - - -def _list_from_env(env_name, default=[]): - """Returns comma-separated list from environment variable.""" - if env_name in os.environ: - return os.environ[env_name].split(",") - return default - - -def _get_legacy_path(env_name, default=[]): - """Returns a path specified by a legacy environment variable. - - CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to - '/usr/lib/x86_64-linux-gnu' would previously find both library and header - paths. Detect those and return '/usr', otherwise forward to _list_from_env(). 
- """ - if env_name in os.environ: - match = re.match(r"^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", os.environ[env_name]) - if match: - return [match.group(1)] - return _list_from_env(env_name, default) - - -def _normalize_path(path): - """Returns normalized path, with forward slashes on Windows.""" - path = os.path.realpath(path) - if _is_windows(): - path = path.replace("\\", "/") - return path - - -def find_cuda_config(): - """Returns a dictionary of CUDA library and header file paths.""" - libraries = [argv.lower() for argv in sys.argv[1:]] - cuda_version = os.environ.get("TF_CUDA_VERSION", "") - base_paths = _list_from_env("TF_CUDA_PATHS", _get_default_cuda_paths(cuda_version)) - base_paths = [path for path in base_paths if os.path.exists(path)] - - result = {} - if "cuda" in libraries: - cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) - result.update(_find_cuda_config(cuda_paths, cuda_version)) - - cuda_version = result["cuda_version"] - cublas_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (10, 1): - # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. - cublas_paths = cuda_paths - cublas_version = os.environ.get("TF_CUBLAS_VERSION", "") - result.update(_find_cublas_config(cublas_paths, cublas_version, cuda_version)) - - cusolver_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cusolver_paths = cuda_paths - cusolver_version = os.environ.get("TF_CUSOLVER_VERSION", "") - result.update(_find_cusolver_config(cusolver_paths, cusolver_version, cuda_version)) - - curand_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - curand_paths = cuda_paths - curand_version = os.environ.get("TF_CURAND_VERSION", "") - result.update(_find_curand_config(curand_paths, curand_version, cuda_version)) - - cufft_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cufft_paths = cuda_paths - cufft_version = os.environ.get("TF_CUFFT_VERSION", "") - result.update(_find_cufft_config(cufft_paths, cufft_version, cuda_version)) - - cusparse_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cusparse_paths = cuda_paths - cusparse_version = os.environ.get("TF_CUSPARSE_VERSION", "") - result.update(_find_cusparse_config(cusparse_paths, cusparse_version, cuda_version)) - - if "cudnn" in libraries: - cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) - cudnn_version = os.environ.get("TF_CUDNN_VERSION", "") - result.update(_find_cudnn_config(cudnn_paths, cudnn_version)) - - if "nccl" in libraries: - nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths) - nccl_version = os.environ.get("TF_NCCL_VERSION", "") - result.update(_find_nccl_config(nccl_paths, nccl_version)) - - if "tensorrt" in libraries: - tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths) - tensorrt_version = os.environ.get("TF_TENSORRT_VERSION", "") - result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version)) - - for k, v in result.items(): - if k.endswith("_dir") or k.endswith("_path"): - result[k] = _normalize_path(v) - - return result - - -def main(): - try: - for key, value in sorted(find_cuda_config().items()): - print("%s: %s" % (key, value)) - except ConfigError as e: - sys.stderr.write(str(e)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/configure.py b/configure.py index 60fb41cf..e2086460 100644 --- a/configure.py +++ b/configure.py @@ -17,12 +17,15 @@ import argparse import errno import glob +import 
logging import os import pathlib import platform import re +import shutil import subprocess import sys +from typing import Optional import tensorflow as tf from packaging.version import Version @@ -37,9 +40,6 @@ _DEFAULT_CUDA_VERSION = '11' _DEFAULT_CUDNN_VERSION = '2' _DEFAULT_TENSORRT_VERSION = '6' -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '7.0,7.5,8.0,8.6' - -_SUPPORTED_ANDROID_NDK_VERSIONS = [19, 20, 21, 25] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -48,20 +48,6 @@ _DP_BAZELRC = '' _DP_CURRENT_BAZEL_VERSION = None -NCCL_LIB_PATHS = ['lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''] - -# List of files to configure when building Bazel on Apple platforms. -APPLE_BAZEL_FILES = [ - 'tensorflow/lite/ios/BUILD', 'tensorflow/lite/objc/BUILD', 'tensorflow/lite/swift/BUILD', - 'tensorflow/lite/tools/benchmark/experimental/ios/BUILD' -] - -# List of files to move when building for iOS. -IOS_FILES = [ - 'tensorflow/lite/objc/TensorFlowLiteObjC.podspec', - 'tensorflow/lite/swift/TensorFlowLiteSwift.podspec', -] - class UserInputError(Exception): pass @@ -104,6 +90,45 @@ def get_tf_header_dir(): return tf_header_dir +def get_cpp_version(): + cpp_version = "c++14" + if Version(tf.__version__) >= Version("2.10"): + cpp_version = "c++17" + return cpp_version + + +def get_tf_shared_lib_dir(): + import tensorflow as tf + + # OS Specific parsing + if is_windows(): + tf_shared_lib_dir = tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + return tf_shared_lib_dir.replace("\\", "/") + elif is_raspi_arm(): + return tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + else: + return tf.sysconfig.get_link_flags()[0][2:] + + +# Converts the linkflag namespec to the full shared library name +def get_shared_lib_name(): + import tensorflow as tf + + namespec = tf.sysconfig.get_link_flags() + if is_macos(): + # MacOS + return "lib" + namespec[1][2:] + ".dylib" + elif is_windows(): + # Windows + return "_pywrap_tensorflow_internal.lib" + elif is_raspi_arm(): + # The below command for linux would return an empty list + return "_pywrap_tensorflow_internal.so" + else: + # Linux + return namespec[1][3:] + + def get_tf_version_integer(): """ Get Tensorflow version as a 4 digits string. @@ -115,7 +140,7 @@ def get_tf_version_integer(): 2.8.3 get 2083 The 4-digits-string will be passed to C macro to discriminate different - Tensorflow versions. + Tensorflow versions. We assume that major version has 1 digit, minor version has 2 digits. And patch version has 1 digit. 
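Illustrative sketch, not part of the patch, of the digit layout this docstring describes: the major version keeps one digit, the minor version is widened to two digits, and the patch version keeps one digit, so TensorFlow 2.8.3 maps to 2083. The real helper derives the number from tf.__version__; the hypothetical function below only demonstrates the arithmetic:

def tf_version_integer(version: str) -> int:
  major, minor, patch = (int(part) for part in version.split(".")[:3])
  return major * 1000 + minor * 10 + patch

assert tf_version_integer("2.8.3") == 2083
assert tf_version_integer("2.15.0") == 2150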
@@ -146,45 +171,6 @@ def get_tf_version_integer(): return int(tf_version_num) -def get_cpp_version(): - cpp_version = "c++14" - if Version(tf.__version__) >= Version("2.10"): - cpp_version = "c++17" - return cpp_version - - -def get_tf_shared_lib_dir(): - import tensorflow as tf - - # OS Specific parsing - if is_windows(): - tf_shared_lib_dir = tf.sysconfig.get_compile_flags()[0][2:-7] + "python" - return tf_shared_lib_dir.replace("\\", "/") - elif is_raspi_arm(): - return tf.sysconfig.get_compile_flags()[0][2:-7] + "python" - else: - return tf.sysconfig.get_link_flags()[0][2:] - - -# Converts the linkflag namespec to the full shared library name -def get_shared_lib_name(): - import tensorflow as tf - - namespec = tf.sysconfig.get_link_flags() - if is_macos(): - # MacOS - return "lib" + namespec[1][2:] + ".dylib" - elif is_windows(): - # Windows - return "_pywrap_tensorflow_internal.lib" - elif is_raspi_arm(): - # The below command for linux would return an empty list - return "_pywrap_tensorflow_internal.so" - else: - # Linux - return namespec[1][3:] - - def get_input(question): try: try: @@ -222,6 +208,10 @@ def write_action_env(var_name, var): write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) +def write_repo_env(var_name, var): + write_to_bazelrc('build --repo_env {}="{}"'.format(var_name, str(var))) + + def run_shell(cmd, allow_non_zero=False, stderr=None): if stderr is None: stderr = sys.stdout @@ -315,6 +305,8 @@ def setup_python(environ_cp): python_major_version = get_python_major_version(python_bin_path) if python_major_version == '2': write_to_bazelrc('build --host_force_python=PY2') + logging.debug(f"Hermetic Python version: {sys.version_info.major}.{sys.version_info.minor}") + write_repo_env("HERMETIC_PYTHON_VERSION", f"{sys.version_info.major}.{sys.version_info.minor}") # Convert python path to Windows style before writing into bazel.rc if is_windows() or is_cygwin(): @@ -553,44 +545,6 @@ def set_cc_opt_flags(environ_cp): write_to_bazelrc('build:opt --host_copt=%s' % opt) -def set_tf_cuda_clang(environ_cp): - """set TF_CUDA_CLANG action_env. - - Args: - environ_cp: copy of the os.environ. - """ - question = 'Do you want to use clang as CUDA compiler?' - yes_reply = 'Clang will be used as CUDA compiler.' - no_reply = 'nvcc will be used as CUDA compiler.' - set_action_env_var( - environ_cp, - 'TF_CUDA_CLANG', - None, - False, - question=question, - yes_reply=yes_reply, - no_reply=no_reply, - bazel_config_name='cuda_clang', - ) - - -def set_tf_download_clang(environ_cp): - """Set TF_DOWNLOAD_CLANG action_env.""" - question = 'Do you wish to download a fresh release of clang? (Experimental)' - yes_reply = 'Clang will be downloaded and used to compile tensorflow.' - no_reply = 'Clang will not be downloaded.' - set_action_env_var( - environ_cp, - 'TF_DOWNLOAD_CLANG', - None, - False, - question=question, - yes_reply=yes_reply, - no_reply=no_reply, - bazel_config_name='download_clang' - ) - - def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var, var_default): """Get var_name either from env, or user or default. 
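Hedged sketch, not part of the patch: the shape of the .bazelrc line produced by the new write_repo_env() helper, shown here for the hermetic Python version that setup_python() derives from the running interpreter. The local repo_env_line() function is hypothetical and only mirrors the format string used above:

import sys

def repo_env_line(var_name: str, var: str) -> str:
  return 'build --repo_env {}="{}"'.format(var_name, str(var))

hermetic_python = f"{sys.version_info.major}.{sys.version_info.minor}"
print(repo_env_line("HERMETIC_PYTHON_VERSION", hermetic_python))
# Prints, for example: build --repo_env HERMETIC_PYTHON_VERSION="3.10"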
@@ -683,28 +637,12 @@ def prompt_loop_or_load_from_env( return val -def set_clang_cuda_compiler_path(environ_cp): - """Set CLANG_CUDA_COMPILER_PATH.""" - default_clang_path = '/usr/lib/llvm-17/bin/clang' - if not os.path.exists(default_clang_path): - default_clang_path = '/usr/lib/llvm-16/bin/clang' - if not os.path.exists(default_clang_path): - default_clang_path = which('clang') or '' - - clang_cuda_compiler_path = prompt_loop_or_load_from_env( - environ_cp, - var_name='CLANG_CUDA_COMPILER_PATH', - var_default=default_clang_path, - ask_for_var='Please specify clang path that to be used as host compiler.', - check_success=os.path.exists, - resolve_symlinks=True, - error_msg='Invalid clang path. %s cannot be found.', - ) - - # Set CLANG_CUDA_COMPILER_PATH - environ_cp['CLANG_CUDA_COMPILER_PATH'] = clang_cuda_compiler_path - write_action_env('CLANG_CUDA_COMPILER_PATH', clang_cuda_compiler_path) - return clang_cuda_compiler_path +def choose_compiler(environ_cp): + question = 'Do you want to use Clang as the compiler?' + yes_reply = 'Clang will be used to compile Deepray.' + no_reply = 'GCC will be used to compile Deepray.' + var = int(get_var(environ_cp, 'TF_NEED_CLANG', None, False, question, yes_reply, no_reply)) + return var def set_gcc_host_compiler_path(environ_cp): @@ -726,16 +664,20 @@ def set_gcc_host_compiler_path(environ_cp): resolve_symlinks=True, error_msg='Invalid gcc path. %s cannot be found.', ) + write_repo_env("CC", gcc_host_compiler_path) + write_repo_env("BAZEL_COMPILER", gcc_host_compiler_path) + return gcc_host_compiler_path - write_action_env('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path) - -def choose_compiler(environ_cp): - question = 'Do you want to use Clang to build Deepray?' - yes_reply = 'Clang will be used to compile Deepray.' - no_reply = 'GCC will be used to compile Deepray.' - var = int(get_var(environ_cp, 'TF_NEED_CLANG', None, False, question, yes_reply, no_reply)) - return var +def get_gcc_major_version(gcc_path: str): + gcc_version_proc = subprocess.run( + [gcc_path, "-dumpversion"], + check=True, + capture_output=True, + text=True, + ) + major_version = int(gcc_version_proc.stdout) + return major_version def set_clang_compiler_path(environ_cp): @@ -751,10 +693,13 @@ def set_clang_compiler_path(environ_cp): Returns: string value for clang_compiler_path. """ - # Default path if clang-16 is installed by using apt-get install - default_clang_path = '/usr/lib/llvm-17/bin/clang' + # Default path if clang-18 is installed by using apt-get install + # remove 16 logic upon release of 19 + default_clang_path = '/usr/lib/llvm-18/bin/clang' if not os.path.exists(default_clang_path): - default_clang_path = '/usr/lib/llvm-16/bin/clang' + default_clang_path = '/usr/lib/llvm-17/bin/clang' + if not os.path.exists(default_clang_path): + default_clang_path = '/usr/lib/llvm-16/bin/clang' if not os.path.exists(default_clang_path): default_clang_path = which('clang') or '' @@ -772,9 +717,8 @@ def set_clang_compiler_path(environ_cp): ), ) - write_action_env('CLANG_COMPILER_PATH', clang_compiler_path) - write_to_bazelrc('build --repo_env=CC=%s' % clang_compiler_path) - write_to_bazelrc('build --repo_env=BAZEL_COMPILER=%s' % clang_compiler_path) + write_repo_env('CC', clang_compiler_path) + write_repo_env('BAZEL_COMPILER', clang_compiler_path) return clang_compiler_path @@ -812,8 +756,16 @@ def retrieve_clang_version(clang_executable): # offset of in the current version of ubp. 
See # https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183. def disable_clang_offsetof_extension(clang_version): - if int(clang_version.split('.')[0]) in (16, 17): + clang_major_version = int(clang_version.split('.')[0]) + if clang_major_version in (16, 17): write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions') + if clang_major_version >= 16: + # Enable clang settings that are needed for the build to work with newer + # versions of Clang. + write_to_bazelrc("build --config=clang") + if clang_major_version < 19: + # Prevent XNNPACK from using `-mavxvnniint8` (only available in clang 16+/gcc 13+). + write_to_bazelrc("build --define=xnn_enable_avxvnniint8=false") def set_tf_cuda_paths(environ_cp): @@ -885,37 +837,76 @@ def set_tf_nccl_version(environ_cp): environ_cp['TF_NCCL_VERSION'] = tf_nccl_version -def get_native_cuda_compute_capabilities(environ_cp): - """Get native cuda compute capabilities. +def _find_executable(executable: str) -> Optional[str]: + logging.info("Trying to find path to %s...", executable) + # Resolving the symlink is necessary for finding system headers. + if unresolved_path := shutil.which(executable): + return str(pathlib.Path(unresolved_path).resolve()) + return None + + +def _find_executable_or_die(executable_name: str, executable_path: Optional[str] = None) -> str: + """Finds executable and resolves symlinks or raises RuntimeError. + + Resolving symlinks is sometimes necessary for finding system headers. Args: - environ_cp: copy of the os.environ. + executable_name: The name of the executable that we want to find. + executable_path: If not None, the path to the executable. Returns: - string of native cuda compute capabilities, separated by comma. + The path to the executable we are looking for, after symlinks are resolved. + Raises: + RuntimeError: if path to the executable cannot be found. """ - device_query_bin = os.path.join(environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') - if os.path.isfile(device_query_bin) and os.access(device_query_bin, os.X_OK): - try: - output = run_shell(device_query_bin).split('\n') - pattern = re.compile('[0-9]*\\.[0-9]*') - output = [pattern.search(x) for x in output if 'Capability' in x] - output = ','.join(x.group() for x in output if x is not None) - except subprocess.CalledProcessError: - output = '' - else: - output = '' - return output + if executable_path: + return str(pathlib.Path(executable_path).resolve(strict=True)) + resolved_path_to_exe = _find_executable(executable_name) + if resolved_path_to_exe is None: + raise RuntimeError( + f"Could not find executable `{executable_name}`! " + "Please change your $PATH or pass the path directly like" + f"`--{executable_name}_path=path/to/executable." + ) + logging.info("Found path to %s at %s", executable_name, resolved_path_to_exe) + + return resolved_path_to_exe + + +def _get_cuda_compute_capabilities_or_die() -> list[str]: + """Finds compute capabilities via nvidia-smi or rasies exception. + + Returns: + list of unique, sorted strings representing compute capabilities: + Raises: + RuntimeError: if path to nvidia-smi couldn't be found. + subprocess.CalledProcessError: if nvidia-smi process failed. 
+ """ + try: + nvidia_smi = _find_executable_or_die("nvidia-smi") + nvidia_smi_proc = subprocess.run( + [nvidia_smi, "--query-gpu=compute_cap", "--format=csv,noheader"], + capture_output=True, + check=True, + text=True, + ) + # Command above returns a newline separated list of compute capabilities + # with possible repeats. So we should unique them and sort the final result. + capabilities = sorted(set(nvidia_smi_proc.stdout.strip().split("\n"))) + logging.info("Found CUDA compute capabilities: %s", capabilities) + return ','.join(capabilities) + except (RuntimeError, subprocess.CalledProcessError) as e: + logging.info( + "Could not find nvidia-smi, or nvidia-smi command failed. Please pass" + " capabilities directly using --cuda_compute_capabilities." + ) + raise e -def set_tf_cuda_compute_capabilities(environ_cp): - """Set TF_CUDA_COMPUTE_CAPABILITIES.""" +def set_hermetic_cuda_compute_capabilities(environ_cp): + """Set HERMETIC_CUDA_COMPUTE_CAPABILITIES.""" while True: - native_cuda_compute_capabilities = get_native_cuda_compute_capabilities(environ_cp) - if not native_cuda_compute_capabilities: - default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES - else: - default_cuda_compute_capabilities = native_cuda_compute_capabilities + default_cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die() ask_cuda_compute_capabilities = ( 'Please specify a list of comma-separated CUDA compute capabilities ' @@ -925,18 +916,21 @@ def set_tf_cuda_compute_capabilities(environ_cp): ' binary GPU code, or as "sm_xy" to only include the binary ' 'code.\nPlease note that each additional compute capability ' 'significantly increases your build time and binary size, and that ' - 'TensorFlow only supports compute capabilities >= 3.5 [Default is: ' + 'Deepray only supports compute capabilities >= 3.5 [Default is: ' '%s]: ' % default_cuda_compute_capabilities ) - tf_cuda_compute_capabilities = get_from_env_or_user_or_default( - environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', ask_cuda_compute_capabilities, default_cuda_compute_capabilities + hermetic_cuda_compute_capabilities = get_from_env_or_user_or_default( + environ_cp, + 'HERMETIC_CUDA_COMPUTE_CAPABILITIES', + ask_cuda_compute_capabilities, + default_cuda_compute_capabilities, ) # Check whether all capabilities from the input is valid all_valid = True # Remove all whitespace characters before splitting the string # that users may insert by accident, as this will result in error - tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split()) - for compute_capability in tf_cuda_compute_capabilities.split(','): + hermetic_cuda_compute_capabilities = ''.join(hermetic_cuda_compute_capabilities.split()) + for compute_capability in hermetic_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) if not m: # We now support sm_35,sm_50,sm_60,compute_70. 
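Illustrative sketch, not part of the patch: the post-processing that _get_cuda_compute_capabilities_or_die() applies to `nvidia-smi --query-gpu=compute_cap --format=csv,noheader` output, which is a newline-separated list with possible repeats. The sample stdout below is assumed, not real nvidia-smi output:

sample_stdout = "8.6\n8.6\n7.5\n"
capabilities = sorted(set(sample_stdout.strip().split("\n")))
assert capabilities == ["7.5", "8.6"]
assert ",".join(capabilities) == "7.5,8.6"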
@@ -977,20 +971,26 @@ def set_tf_cuda_compute_capabilities(environ_cp): break # Reset and Retry - environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = '' + environ_cp['HERMETIC_CUDA_COMPUTE_CAPABILITIES'] = '' - # Set TF_CUDA_COMPUTE_CAPABILITIES - environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = tf_cuda_compute_capabilities - write_action_env('TF_CUDA_COMPUTE_CAPABILITIES', tf_cuda_compute_capabilities) + # Set HERMETIC_CUDA_COMPUTE_CAPABILITIES + environ_cp['HERMETIC_CUDA_COMPUTE_CAPABILITIES'] = (hermetic_cuda_compute_capabilities) + write_to_bazelrc( + 'build:{} --repo_env {}="{}"'.format( + 'cuda', 'HERMETIC_CUDA_COMPUTE_CAPABILITIES', str(hermetic_cuda_compute_capabilities) + ) + ) def set_other_cuda_vars(environ_cp): """Set other CUDA related variables.""" # If CUDA is enabled, always use GPU during build and test. - if environ_cp.get('TF_CUDA_CLANG') == '1': + if environ_cp.get('TF_NEED_CLANG') == '1': write_to_bazelrc('build --config=cuda_clang') + write_action_env('CLANG_CUDA_COMPILER_PATH', environ_cp.get('CLANG_COMPILER_PATH')) else: write_to_bazelrc('build --config=cuda') + write_to_bazelrc('build --config=cuda_nvcc') def system_specific_test_config(environ_cp): @@ -1192,55 +1192,24 @@ def main(): # This should be replaced with a call to tf.sysconfig if it's added write_action_env("TF_CPLUSPLUS_VER", get_cpp_version()) + tf_version_integer = get_tf_version_integer() # This is used to trace the difference between Tensorflow versions. - write_action_env("TF_VERSION_INTEGER", get_tf_version_integer()) - - if is_windows(): - environ_cp['TF_NEED_OPENCL'] = '0' - environ_cp['TF_CUDA_CLANG'] = '0' - # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on - # Windows. - environ_cp['TF_DOWNLOAD_CLANG'] = '0' - environ_cp['TF_NEED_MPI'] = '0' - - if is_macos(): - environ_cp['TF_NEED_TENSORRT'] = '0' - - if is_ppc64le(): - # Enable MMA Dynamic Dispatch support if 'gcc' and if linker >= 2.35 - gcc_env = get_gcc_compiler(environ_cp) - if gcc_env is not None: - - # Use gold linker if 'gcc' and if 'ppc64le' - write_to_bazelrc('build --linkopt="-fuse-ld=gold"') - - # Get the linker version - ld_version = run_shell([gcc_env, '-Wl,-version']).split() - - ld_version_int = convert_version_to_int(ld_version[3]) - if ld_version_int is None: - ld_version_int = convert_version_to_int(ld_version[4]) + write_action_env("TF_VERSION_INTEGER", tf_version_integer) + write_to_bazelrc('') - # Enable if 'ld' version >= 2.35 - if ld_version_int >= 2035000: - write_to_bazelrc('build --copt="-DEIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH=1"') - - with_xla_support = environ_cp.get('TF_ENABLE_XLA', None) - if with_xla_support is not None: - write_to_bazelrc('build --define=with_xla_support=%s' % ('true' if int(with_xla_support) else 'false')) - - # set_action_env_var(environ_cp, 'TF_NEED_ROCM', 'ROCm', False, bazel_config_name='rocm') - if ( - environ_cp.get('TF_NEED_ROCM') == '1' and 'LD_LIBRARY_PATH' in environ_cp and - environ_cp.get('LD_LIBRARY_PATH') != '1' - ): - write_action_env('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) - - if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')): - write_action_env('ROCM_PATH', environ_cp.get('ROCM_PATH')) - - if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('HIP_PLATFORM')): - write_action_env('HIP_PLATFORM', environ_cp.get('HIP_PLATFORM')) + # Ask whether we should use clang for the CPU build. 
+ if is_linux(): + environ_cp['TF_NEED_CLANG'] = str(choose_compiler(environ_cp)) + if environ_cp.get('TF_NEED_CLANG') == '1': + clang_compiler_path = set_clang_compiler_path(environ_cp) + clang_version = retrieve_clang_version(clang_compiler_path) + disable_clang_offsetof_extension(clang_version) + else: + gcc_path = set_gcc_host_compiler_path(environ_cp) + gcc_major_version = get_gcc_major_version(gcc_path) + if gcc_major_version < 13: + # Prevent XNNPACK from using `-mavxvnniint8` (only available in clang 16+/gcc 13+). + write_to_bazelrc('build --define=xnn_enable_avxvnniint8=false') if is_windows(): print( @@ -1300,30 +1269,14 @@ def main(): 'times in a row. Assuming to be a scripting mistake.' % _DEFAULT_PROMPT_ASK_ATTEMPTS ) - set_tf_cuda_compute_capabilities(environ_cp) + set_hermetic_cuda_compute_capabilities(environ_cp) if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get('LD_LIBRARY_PATH') != '1': write_action_env('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) - set_tf_cuda_clang(environ_cp) - if environ_cp.get('TF_CUDA_CLANG') == '1': - # Set up which clang we should use as the cuda / host compiler. - clang_cuda_compiler_path = set_clang_cuda_compiler_path(environ_cp) - clang_version = retrieve_clang_version(clang_cuda_compiler_path) - disable_clang_offsetof_extension(clang_version) - else: - # Set up which gcc nvcc should use as the host compiler - # No need to set this on Windows - if not is_windows(): - set_gcc_host_compiler_path(environ_cp) set_other_cuda_vars(environ_cp) else: - # CUDA not required. Ask whether we should use clang for the CPU build. - if is_linux(): - environ_cp['TF_NEED_CLANG'] = str(choose_compiler(environ_cp)) - if environ_cp.get('TF_NEED_CLANG') == '1': - clang_compiler_path = set_clang_compiler_path(environ_cp) - clang_version = retrieve_clang_version(clang_compiler_path) - disable_clang_offsetof_extension(clang_version) + if environ_cp.get('TF_NEED_CLANG') == '1': + write_action_env('CLANG_COMPILER_PATH', clang_compiler_path) # ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. diff --git a/deepray/BUILD b/deepray/BUILD index 06896eb7..5d7a13a2 100644 --- a/deepray/BUILD +++ b/deepray/BUILD @@ -1,119 +1,32 @@ -load("//deepray:tensorflow.bzl", "if_google") -load("@bazel_skylib//lib:selects.bzl", "selects") - licenses(["notice"]) # Apache 2.0 -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "windows", - constraint_values = ["@platforms//os:windows"], +package( + default_visibility = [":internal"], + licenses = ["notice"], # Apache 2.0 ) -# Sometimes Bazel reports darwin_x86_64 as "darwin" and sometimes as -# "darwin_x86_64". The former shows up when building on a Mac x86_64 host for a Mac x86_64 target. -# The latter shows up when cross-compiling for Mac x86_64 from a Mac ARM machine and in internal -# Google builds. 
-config_setting( - name = "macos_x86_64_default", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, -) +exports_files([ + "LICENSE", +]) -config_setting( - name = "macos_x86_64_crosscompile", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin_x86_64", - }, -) - -selects.config_setting_group( - name = "macos_x86_64", - match_any = [ - ":macos_x86_64_default", - ":macos_x86_64_crosscompile", +package_group( + name = "internal", + includes = [ ], - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_arm64", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin_arm64", - }, - visibility = ["//visibility:public"], -) - -# TODO(jakeharmon): Remove in favor of TSL version -selects.config_setting_group( - name = "macos", - match_any = [ - ":macos_x86_64", - ":macos_arm64", + packages = [ + "//...", + "//deepray/...", ], - visibility = ["//visibility:public"], ) -# Crosses between framework_shared_object and a bunch of other configurations -# due to limitations in nested select() statements. config_setting( - name = "framework_shared_object", - define_values = {"framework_shared_object": "true"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_x86_64_with_framework_shared_object", - define_values = { - "framework_shared_object": "true", - }, - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_arm64_with_framework_shared_object", - define_values = { - "framework_shared_object": "true", - }, - values = { - "apple_platform_type": "macos", - "cpu": "darwin_arm64", - }, - visibility = ["//visibility:public"], -) - -selects.config_setting_group( - name = "macos_with_framework_shared_object", - match_any = [ - ":macos_x86_64_with_framework_shared_object", - ":macos_arm64_with_framework_shared_object", - ], - visibility = ["//visibility:public"], + name = "windows", + constraint_values = ["@platforms//os:windows"], ) py_library( name = "deepray", - srcs = glob(["*.py"]), + srcs = glob(["**/*.py"]), deps = [ "//deepray/activations", "//deepray/callbacks", @@ -123,10 +36,11 @@ py_library( "//deepray/layers", "//deepray/losses", "//deepray/metrics", + "//deepray/models", "//deepray/optimizers", - "//deepray/seq2seq", + # "//deepray/seq2seq", "//deepray/testing", - "//deepray/text", + # "//deepray/text", "//deepray/utils", ], ) diff --git a/deepray/__init__.py b/deepray/__init__.py index b8731d98..a1d02a99 100644 --- a/deepray/__init__.py +++ b/deepray/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2023 The Deepray Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,22 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Useful extra functionality for TensorFlow maintained by SIG-deepray.""" +import argparse +import os + +os.environ["TF_USE_LEGACY_KERAS"] = "1" import sys +import tensorflow as tf from absl import flags -from deepray.utils.flags import common_flags - -common_flags.define_common_flags() - -FLAGS = flags.FLAGS -FLAGS(sys.argv, known_only=True) - -from deepray.utils.ensure_tf_install import _check_tf_version - -_check_tf_version() - # Local project imports from deepray import activations from deepray import callbacks @@ -35,12 +28,98 @@ from deepray import layers from deepray import losses from deepray import metrics +from deepray import models from deepray import optimizers -from deepray.layers import rnn -from deepray import seq2seq -from deepray import text from deepray import options from deepray.register import register_all +from deepray.utils import logging_util from deepray.utils import types - +from deepray.utils.ensure_tf_install import _check_tf_version +from deepray.utils.flags import common_flags from deepray.version import __version__ +from deepray.utils import gpu_affinity + +# _check_tf_version() + +logger = logging_util.get_logger() + +common_flags.define_common_flags() +flags.FLAGS(sys.argv, known_only=True) + + +def init(): + logger.debug(f"sys.argv = {sys.argv}") # sys.argv from Horovod + + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + + if flags.FLAGS.distribution_strategy == "horovod": + import horovod.tensorflow as hvd + hvd.init() + if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + gpu_affinity.set_affinity(hvd.local_rank()) + + +def start_tensorflow_server(cluster_resolver): + # Set the environment variable to allow reporting worker and ps failure to the + # coordinator. This is a workaround and won't be necessary in the future. + os.environ["GRPC_FAIL_FAST"] = "use_caller" + + server = tf.distribute.Server( + cluster_resolver.cluster_spec(), + job_name=cluster_resolver.task_type, + task_index=cluster_resolver.task_id, + protocol=cluster_resolver.rpc_layer or "grpc", + start=True, + ) + server.join() + + +def runner(function, verbose=None): + parser = argparse.ArgumentParser(description='Deepray Runner') + parser.add_argument('-v', '--version', action='version', version=__version__, help='Shows Deepray version.') + parser.add_argument( + '--distribution_strategy', type=str, default='Horovod', help='Whether run distributed training with Horovod.' 
+ ) + + physical_devices = tf.config.list_physical_devices('GPU') + world_size = len(physical_devices) + logger.debug(f"world_size = {world_size}") + + user_argv = sys.argv # get user specified args + args, unknown = parser.parse_known_args() + + if world_size > 1 and args.distribution_strategy == "Horovod": + user_argv.extend([ + "--distribution_strategy=horovod", + f"--num_gpus={world_size}", + "--use_horovod", + ]) + try: + import horovod + os.environ['HOROVOD_STALL_CHECK_TIME_SECONDS'] = '5' + os.environ['HOROVOD_STALL_SHUTDOWN_TIME_SECONDS'] = '30' + except ImportError: + raise ValueError("Please install Horovod properly first if you want to use Horovod distribution_strategy.") + + def helper(argv, main): + logger.debug(f"argv = {argv}") + init() + main() + + horovod.run(helper, args=(sys.argv,), kwargs={"main": function}, np=world_size, verbose=verbose, use_mpi=True) + elif args.distribution_strategy == "ParameterServer": + cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() + if cluster_resolver.task_type in ("worker", "ps"): + start_tensorflow_server(cluster_resolver) + else: + user_argv.extend(["--distribution_strategy=parameter_server"]) + init() + function() + else: + logger.info("Deepray finds only one GPU available, so we turn off distribution_strategy.") + user_argv.extend(["--distribution_strategy=off", f"--num_gpus={world_size}"]) + init() + function() diff --git a/deepray/activations/__init__.py b/deepray/activations/__init__.py index 58300cdb..e69de29b 100644 --- a/deepray/activations/__init__.py +++ b/deepray/activations/__init__.py @@ -1,27 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Additional activation functions.""" - -from deepray.activations.hardshrink import hardshrink -from deepray.activations.lisht import lisht -from deepray.activations.mish import mish -from deepray.activations.softshrink import softshrink -from deepray.activations.rrelu import rrelu -from deepray.activations.snake import snake -from deepray.activations.sparsemax import sparsemax -from deepray.activations.tanhshrink import tanhshrink -from deepray.activations.swish import simple_swish -from deepray.activations.swish import hard_swish -from deepray.activations.swish import identity \ No newline at end of file diff --git a/deepray/callbacks/__init__.py b/deepray/callbacks/__init__.py index 98f2eae1..ee575bcd 100755 --- a/deepray/callbacks/__init__.py +++ b/deepray/callbacks/__init__.py @@ -15,6 +15,7 @@ """Additional callbacks that conform to Keras API.""" from deepray.callbacks.average_model_checkpoint import AverageModelCheckpoint +from deepray.callbacks.callbacks import HvdCallbackList +from deepray.callbacks.model_checkpoint import ModelCheckpoint from deepray.callbacks.time_stopping import TimeStopping from deepray.callbacks.tqdm_progress_bar import TQDMProgressBar -from deepray.callbacks.callbacks import HvdCallbackList diff --git a/deepray/callbacks/callbacks.py b/deepray/callbacks/callbacks.py index 6c6f7066..0250e3fb 100644 --- a/deepray/callbacks/callbacks.py +++ b/deepray/callbacks/callbacks.py @@ -13,44 +13,43 @@ # limitations under the License. # ============================================================================== """Callbacks: utilities called at certain points during model training.""" - -import horovod.tensorflow.keras as hvd import numpy as np import tensorflow as tf from absl import flags -from keras.callbacks import CallbackList - -FLAGS = flags.FLAGS +from tf_keras import callbacks as callbacks_module def sync_to_numpy_or_python_type(tensors): """Syncs and converts a structure of `Tensor`s to `NumPy` arrays or Python - scalar types. + scalar types. - For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, - it converts it to a Python type, such as a float or int, by calling - `result.item()`. + For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, + it converts it to a Python type, such as a float or int, by calling + `result.item()`. - Numpy scalars are converted, as Python types are often more convenient to - deal with. This is especially useful for bfloat16 Numpy scalars, which don't - support as many operations as other Numpy values. + Numpy scalars are converted, as Python types are often more convenient to + deal with. This is especially useful for bfloat16 Numpy scalars, which don't + support as many operations as other Numpy values. - Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are - forced to - sync during this process. + Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are + forced to + sync during this process. - Args: - tensors: A structure of tensors. + Args: + tensors: A structure of tensors. - Returns: - `tensors`, but scalar tensors are converted to Python types and non-scalar - tensors are converted to Numpy arrays. - """ + Returns: + `tensors`, but scalar tensors are converted to Python types and non-scalar + tensors are converted to Numpy arrays. 
+ """ if isinstance(tensors, tf.distribute.experimental.coordinator.RemoteValue): tensors = tensors.fetch() + if isinstance(tensors, list) and isinstance(tensors[0], tf.distribute.experimental.coordinator.RemoteValue): + tensors = tf.nest.map_structure(lambda t: t.fetch(), tensors) def _to_single_numpy_or_python_type(t): - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: + import horovod.tensorflow.keras as hvd t = hvd.allreduce(t, op=hvd.Average) # Don't turn ragged or sparse tensors to NumPy. if isinstance(t, tf.Tensor): @@ -64,7 +63,7 @@ def _to_single_numpy_or_python_type(t): return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors) -class HvdCallbackList(CallbackList): +class HvdCallbackList(callbacks_module.CallbackList): def _process_logs(self, logs, is_batch_hook=False): """Turns tensors into numpy arrays or Python scalars if necessary.""" diff --git a/deepray/callbacks/model_checkpoint.py b/deepray/callbacks/model_checkpoint.py new file mode 100644 index 00000000..8b84b986 --- /dev/null +++ b/deepray/callbacks/model_checkpoint.py @@ -0,0 +1,147 @@ +# Copyright 2023 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import os +import sys + +import tensorflow as tf +from absl import flags +from tf_keras.callbacks import Callback +from typeguard import typechecked + +from deepray.utils import export +from deepray.utils import logging_util +from deepray.utils.horovod_utils import is_main_process, get_world_size, get_rank + +logger = logging_util.get_logger() + + +@tf.keras.utils.register_keras_serializable(package="Deepray") +class ModelCheckpoint(Callback): + + @typechecked + def __init__(self, save_checkpoint_steps: int = sys.maxsize, max_to_keep: int = 3): + super().__init__() + self.save_checkpoint_steps = save_checkpoint_steps + self.max_to_keep = max_to_keep + self.epochs = flags.FLAGS.epochs + if flags.FLAGS.stop_steps >= 0: + self.epochs = 1 + if flags.FLAGS.use_dynamic_embedding: + from tensorflow_recommenders_addons import dynamic_embedding as de + tf.train.Checkpoint = de.train.checkpoint.DECheckpoint + + def set_models(self, models): + self.models = models + + def set_optimizer(self, optimizer): + self.optimizer = optimizer + + # def set_iterator(self, iterator): + # self.iterator = iterator + + @property + def manager(self): + if len(self._managers) == 1: + return self._managers["main"] + else: + return self._managers + + def on_callback_begin(self): + self._checkpoints, self._managers = {}, {} + for name, model in self.models.items(): + if "main" in name: + _checkpoint = tf.train.Checkpoint(model=model, optimizer=self.optimizer) + self._checkpoints[name] = _checkpoint + if get_world_size() > 1: + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, + os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}_{get_rank()}'), + max_to_keep=self.max_to_keep + ) + else: + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, 
os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=self.max_to_keep + ) + else: + _checkpoint = tf.train.Checkpoint(model=model) + self._checkpoints[name] = _checkpoint + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=self.max_to_keep + ) + + if flags.FLAGS.init_checkpoint: + for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), flags.FLAGS.init_checkpoint): + if init_ckpt: + if tf.io.gfile.isdir(init_ckpt): + latest_checkpoint = tf.train.latest_checkpoint(init_ckpt) + else: + latest_checkpoint = init_ckpt + logger.info( + f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.' + ) + if os.getenv("DEEPRAY_VERBOSITY", None) == "detail" or flags.FLAGS.use_dynamic_embedding: + # TFRA DE doesn't support "assert_existing_objects_matched" method + ckpt.restore(latest_checkpoint) + else: + ckpt.restore(latest_checkpoint).assert_existing_objects_matched() + logger.info('Loading from checkpoint file...') + + self.current_step = 0 + self._steps_from_save = 0 # self.optimizer.iterations.numpy() + + def on_train_begin(self, logs=None): + self.on_callback_begin() + + def on_test_begin(self, logs=None): + self.on_callback_begin() + + def on_predict_begin(self, logs=None): + self.on_callback_begin() + + def on_train_batch_end(self, batch, logs=None): + self.current_step = batch + if self._steps_from_save + self.save_checkpoint_steps <= batch: + export.export_to_checkpoint(self.manager, batch) + self._steps_from_save = batch + + def on_epoch_end(self, epoch, logs=None): + # Saves model checkpoints and run validation steps at every epoch end. + # To avoid repeated model saving, we do not save after the last step of training. + if epoch < self.epochs - 1: + export.export_to_checkpoint(self.manager, self.current_step) + + def on_train_end(self, logs=None): + export.export_to_checkpoint(self.manager, self.current_step) + + def get_config(self): + config = { + "save_checkpoint_steps": self.save_checkpoint_steps, + "max_to_keep": self.max_to_keep, + } + + base_config = super().get_config() + return {**base_config, **config} + + +class SimpleCheckpoint(Callback): + """Keras callback to save tf.train.Checkpoints.""" + + def __init__(self, checkpoint_manager): + super(SimpleCheckpoint, self).__init__() + self.checkpoint_manager = checkpoint_manager + + def on_epoch_end(self, epoch, logs=None): + step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access + self.checkpoint_manager.save(checkpoint_number=step_counter) diff --git a/deepray/callbacks/profiler_callback.py b/deepray/callbacks/profiler_callback.py new file mode 100644 index 00000000..229c715f --- /dev/null +++ b/deepray/callbacks/profiler_callback.py @@ -0,0 +1,68 @@ +from tf_keras.callbacks import Callback +from tensorflow.python.eager import profiler + +from deepray.utils import logging_util + +logger = logging_util.get_logger() + + +def get_profiler_callback(model_dir, profile_steps, enable_tensorboard, steps_per_epoch): + """Validate profile_steps flag value and return profiler callback.""" + profile_steps_error_message = ( + 'profile_steps must be a comma separated pair of positive integers, ' + 'specifying the first and last steps to be profiled.' 
+ ) + try: + profile_steps = [int(i) for i in profile_steps.split(',')] + except ValueError: + raise ValueError(profile_steps_error_message) + if len(profile_steps) != 2: + raise ValueError(profile_steps_error_message) + start_step, stop_step = profile_steps + if start_step < 0 or start_step > stop_step: + raise ValueError(profile_steps_error_message) + if enable_tensorboard: + logger.warning( + 'Both TensorBoard and profiler callbacks are used. Note that the ' + 'TensorBoard callback profiles the 2nd step (unless otherwise ' + 'specified). Please make sure the steps profiled by the two callbacks ' + 'do not overlap.' + ) + return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch) + + +class ProfilerCallback(Callback): + """Save profiles in specified step range to log directory.""" + + def __init__(self, log_dir, start_step, stop_step, steps_per_epoch): + super(ProfilerCallback, self).__init__() + self.log_dir = log_dir + self.start_step = start_step + self.stop_step = stop_step + self.start_epoch = start_step // steps_per_epoch + self.stop_epoch = stop_step // steps_per_epoch + self.start_step_in_epoch = start_step % steps_per_epoch + self.stop_step_in_epoch = stop_step % steps_per_epoch + self.should_start = False + self.should_stop = False + + def on_epoch_begin(self, epoch, logs=None): + if epoch == self.start_epoch: + self.should_start = True + if epoch == self.stop_epoch: + self.should_stop = True + + def on_batch_begin(self, batch, logs=None): + if batch == self.start_step_in_epoch and self.should_start: + self.should_start = False + profiler.start() + logger.info('Profiler started at Step %s', self.start_step) + + def on_batch_end(self, batch, logs=None): + if batch == self.stop_step_in_epoch and self.should_stop: + self.should_stop = False + results = profiler.stop() + profiler.save(self.log_dir, results) + logger.info( + 'Profiler saved profiles for steps between %s and %s to %s', self.start_step, self.stop_step, self.log_dir + ) diff --git a/deepray/callbacks/progbar_logger.py b/deepray/callbacks/progbar_logger.py new file mode 100644 index 00000000..27edf0cd --- /dev/null +++ b/deepray/callbacks/progbar_logger.py @@ -0,0 +1,458 @@ +# Copyright 2023 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import json +import os +import time +import copy +import numpy as np +import sys + +import tensorflow as tf +from absl import flags +from tf_keras.callbacks import Callback +from tf_keras.src.utils import io_utils +from tf_keras.src.utils import tf_utils + +from deepray.utils import logging_util +from deepray.utils.benchmark import PerformanceCalculator +from deepray.utils.flags import common_flags +from deepray.utils.horovod_utils import is_main_process, get_world_size + +logger = logging_util.get_logger() + + +class Progbar: + """Displays a progress bar. + + Args: + target: Total number of steps expected, None if unknown. 
+ width: Progress bar width on screen. + verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics: Iterable of string names of metrics that should *not* + be averaged over time. Metrics in this list will be displayed as-is. + All others will be averaged by the progbar before display. + interval: Minimum visual progress update interval (in seconds). + unit_name: Display name for step counts (usually "step" or "sample"). + """ + + def __init__( + self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name="step", + ): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ( + (hasattr(sys.stdout, "isatty") and sys.stdout.isatty()) or "ipykernel" in sys.modules or + "posix" in sys.modules or "PYCHARM_HOSTED" in os.environ + ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + self._time_at_epoch_start = self._start + self._time_at_epoch_end = None + self._time_after_first_step = None + + def update(self, current, values=None, finalize=None): + """Updates the progress bar. + + Args: + current: Index of current step. + values: List of tuples: `(name, value_for_last_step)`. If `name` is + in `stateful_metrics`, `value_for_last_step` will be displayed + as-is. Else, an average of the metric over time will be + displayed. + finalize: Whether this is the last update for the progress bar. If + `None`, uses `current >= self.target`. Defaults to `None`. + """ + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in + # the first epoch, both on_batch_end and on_epoch_end will be + # called, which will cause 'current' and 'self._seen_so_far' to + # have the same value. Force the minimal value to 1 here, + # otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + message = "" + now = time.time() + info = f" - {now - self._start:.0f}s" + if current == self.target: + self._time_at_epoch_end = now + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + message += "\b" * prev_total_width + message += "\r" + else: + message += "\n" + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ("%" + str(numdigits) + "d/%d [") % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += "=" * (prog_width - 1) + if current < self.target: + bar += ">" + else: + bar += "=" + bar += "." 
* (self.width - prog_width) + bar += "]" + else: + bar = "%7d/Unknown" % current + + self._total_width = len(bar) + message += bar + + time_per_unit = self._estimate_step_duration(current, now) + + if self.target is None or finalize: + info += self._format_time(time_per_unit, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = "%d:%02d:%02d" % ( + eta // 3600, + (eta % 3600) // 60, + eta % 60, + ) + elif eta > 60: + eta_format = "%d:%02d" % (eta // 60, eta % 60) + else: + eta_format = "%ds" % eta + + info = f" - ETA: {eta_format}" + + for k in self._values_order: + info += f" - {k}:" + if isinstance(self._values[k], list): + avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) + if abs(avg) > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + else: + info += f" {self._values[k]}" + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += " " * (prev_total_width - self._total_width) + + if finalize: + info += "\n" + + message += info + logger.info(message) + # io_utils.print_msg(message, line_break=False) + message = "" + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ("%" + str(numdigits) + "d/%d") % (current, self.target) + info = count + info + for k in self._values_order: + info += f" - {k}:" + avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) + if avg > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + if self._time_at_epoch_end: + time_per_epoch = (self._time_at_epoch_end - self._time_at_epoch_start) + avg_time_per_step = time_per_epoch / self.target + self._time_at_epoch_start = now + self._time_at_epoch_end = None + info += " -" + self._format_time(time_per_epoch, "epoch") + info += " -" + self._format_time(avg_time_per_step, self.unit_name) + info += "\n" + message += info + io_utils.print_msg(message, line_break=False) + message = "" + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) + + def _format_time(self, time_per_unit, unit_name): + """format a given duration to display to the user. + + Given the duration, this function formats it in either milliseconds + or seconds and displays the unit (i.e. ms/step or s/epoch) + Args: + time_per_unit: the duration to display + unit_name: the name of the unit to display + Returns: + a string with the correctly formatted duration and units + """ + formatted = "" + if time_per_unit >= 1 or time_per_unit == 0: + formatted += f" {time_per_unit:.0f}s/{unit_name}" + elif time_per_unit >= 1e-3: + formatted += f" {time_per_unit * 1000.0:.0f}ms/{unit_name}" + else: + formatted += f" {time_per_unit * 1000000.0:.0f}us/{unit_name}" + return formatted + + def _estimate_step_duration(self, current, now): + """Estimate the duration of a single step. + + Given the step number `current` and the corresponding time `now` this + function returns an estimate for how long a single step takes. If this + is called before one step has been completed (i.e. `current == 0`) then + zero is given as an estimate. The duration estimate ignores the duration + of the (assumed to be non-representative) first step for estimates when + more steps are available (i.e. `current>1`). + + Args: + current: Index of current step. + now: The current time. + + Returns: Estimate of the duration of a single step. 
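+
+    Illustrative example: if the first step finishes 10s after the bar is
+    created and `current=5` arrives 4s later, the first step is ignored and
+    the estimate is (now - time_after_first_step) / (current - 1) = 4s / 4
+    = 1.0 s/step.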
+ """ + if current: + # there are a few special scenarios here: + # 1) somebody is calling the progress bar without ever supplying + # step 1 + # 2) somebody is calling the progress bar and supplies step one + # multiple times, e.g. as part of a finalizing call + # in these cases, we just fall back to the simple calculation + if self._time_after_first_step is not None and current > 1: + time_per_unit = (now - self._time_after_first_step) / (current - 1) + else: + time_per_unit = (now - self._start) / current + + if current == 1: + self._time_after_first_step = now + return time_per_unit + else: + return 0 + + def _update_stateful_metrics(self, stateful_metrics): + self.stateful_metrics = self.stateful_metrics.union(stateful_metrics) + + +class ProgbarLogger(Callback): + """Callback that prints metrics to stdout. + + Args: + count_mode: One of `"steps"` or `"samples"`. + Whether the progress bar should + count samples seen or steps (batches) seen. + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is. + All others will be averaged over time (e.g. loss, etc). + If not provided, defaults to the `Model`'s metrics. + + Raises: + ValueError: In case of invalid `count_mode`. + """ + + def __init__(self, count_mode: str = "samples", stateful_metrics=None): + super().__init__() + self._supports_tf_logs = True + if count_mode == "samples": + self.use_steps = False + elif count_mode == "steps": + self.use_steps = True + else: + raise ValueError(f"Unknown `count_mode`: {count_mode}. " + 'Expected values are ["samples", "steps"]') + # Defaults to all Model's metrics except for loss. + self.stateful_metrics = (set(stateful_metrics) if stateful_metrics else set()) + + self.seen = 0 + self.progbar = None + self.target = None + self.verbose = 1 + self.epochs = 1 + + self._train_step, self._test_step, self._predict_step = None, None, None + self._call_batch_hooks = True + + self._called_in_fit = False + + def set_params(self, params): + self.verbose = params["verbose"] + self.epochs = params["epochs"] + if self.use_steps and "steps" in params: + self.target = params["steps"] + elif not self.use_steps and "samples" in params: + self.target = params["samples"] + else: + self.target = ( + None # Will be inferred at the end of the first epoch. + ) + + self._call_batch_hooks = self.verbose == 1 + if self.target is None: + try: + self._train_step = self.model._train_counter + self._test_step = self.model._test_counter + self._predict_step = self.model._predict_counter + except AttributeError: + self._call_batch_hooks = True + + def on_train_begin(self, logs=None): + # When this logger is called inside `fit`, validation is silent. + self._called_in_fit = True + + def on_test_begin(self, logs=None): + if not self._called_in_fit: + self._reset_progbar() + self._maybe_init_progbar() + + def on_predict_begin(self, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + + def on_epoch_begin(self, epoch, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + if self.verbose and self.epochs > 1: + io_utils.print_msg(f"Epoch {epoch + 1}/{self.epochs}") + + def on_train_batch_end(self, batch, logs=None): + self._batch_update_progbar(batch, logs) + + def on_test_batch_end(self, batch, logs=None): + if not self._called_in_fit: + self._batch_update_progbar(batch, logs) + + def on_predict_batch_end(self, batch, logs=None): + # Don't pass prediction results. 
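+    # (Prediction outputs are not metrics, so there is nothing to average or
+    # display for the progress bar.)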
+ self._batch_update_progbar(batch, None) + + def on_epoch_end(self, epoch, logs=None): + self._finalize_progbar(logs, self._train_step) + + def on_test_end(self, logs=None): + if not self._called_in_fit: + self._finalize_progbar(logs, self._test_step) + + def on_predict_end(self, logs=None): + self._finalize_progbar(logs, self._predict_step) + + def _reset_progbar(self): + self.seen = 0 + self.progbar = None + + def _maybe_init_progbar(self): + """Instantiate a `Progbar` if not yet, and update the stateful + metrics.""" + # TODO(rchao): Legacy TF1 code path may use list for + # `self.stateful_metrics`. Remove "cast to set" when TF1 support is + # dropped. + self.stateful_metrics = set(self.stateful_metrics) + + if self.model: + # Update the existing stateful metrics as `self.model.metrics` may + # contain updated metrics after `MetricsContainer` is built in the + # first train step. + self.stateful_metrics = self.stateful_metrics.union(set(m.name for m in self.model.metrics)) + + if self.progbar is None: + self.progbar = Progbar( + target=self.target, + verbose=self.verbose, + stateful_metrics=self.stateful_metrics, + unit_name="step" if self.use_steps else "sample", + ) + + self.progbar._update_stateful_metrics(self.stateful_metrics) + + def _implements_train_batch_hooks(self): + return self._call_batch_hooks + + def _implements_test_batch_hooks(self): + return self._call_batch_hooks + + def _implements_predict_batch_hooks(self): + return self._call_batch_hooks + + def _batch_update_progbar(self, batch, logs=None): + """Updates the progbar.""" + logs = logs or {} + self._maybe_init_progbar() + if self.use_steps: + self.seen = batch + 1 # One-indexed. + else: + # v1 path only. + logs = copy.copy(logs) + batch_size = logs.pop("size", 0) + num_steps = logs.pop("num_steps", 1) + logs.pop("batch", None) + add_seen = num_steps * batch_size + self.seen += add_seen + + if self.verbose == 1: + # Only block async when verbose = 1. + logs = tf_utils.sync_to_numpy_or_python_type(logs) + self.progbar.update(self.seen, list(logs.items()), finalize=False) + + def _finalize_progbar(self, logs, counter): + logs = tf_utils.sync_to_numpy_or_python_type(logs or {}) + if self.target is None: + if counter is not None: + counter = counter.numpy() + if not self.use_steps: + counter *= logs.get("size", 1) + self.target = counter or self.seen + self.progbar.target = self.target + self.progbar.update(self.target, list(logs.items()), finalize=True) diff --git a/deepray/utils/misc/keras_utils.py b/deepray/callbacks/time_history.py similarity index 65% rename from deepray/utils/misc/keras_utils.py rename to deepray/callbacks/time_history.py index 94b99092..0778c52c 100644 --- a/deepray/utils/misc/keras_utils.py +++ b/deepray/callbacks/time_history.py @@ -1,29 +1,14 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Helper functions for the Keras implementations of models.""" - -import multiprocessing -import os import time -from absl import logging import tensorflow as tf - +from tf_keras.callbacks import Callback from tensorflow.python.eager import monitoring -global_batch_size_gauge = monitoring.IntGauge('/tensorflow/training/global_batch_size', 'TF training global batch size') +from deepray.utils import logging_util + +logger = logging_util.get_logger() +global_batch_size_gauge = monitoring.IntGauge('/tensorflow/training/global_batch_size', 'TF training global batch size') first_batch_time_gauge = monitoring.IntGauge( '/tensorflow/training/first_batch', 'TF training start/end time for first batch (unix epoch time in us.', 'type' ) @@ -43,7 +28,7 @@ def __repr__(self): return "'BatchTimestamp'".format(self.batch_index, self.timestamp) -class TimeHistory(tf.keras.callbacks.Callback): +class TimeHistory(Callback): """Callback for Keras models.""" def __init__(self, batch_size, log_steps, initial_step=0, logdir=None): @@ -137,7 +122,7 @@ def on_batch_end(self, batch, logs=None): examples_per_second = steps_per_second * self.batch_size self.timestamp_log.append(BatchTimestamp(self.global_steps, now)) - logging.info( + logger.info( 'TimeHistory: %.2f seconds, %.2f examples/second between steps %d ' 'and %d', elapsed_time, examples_per_second, self.last_log_step, self.global_steps ) @@ -156,46 +141,3 @@ def on_epoch_end(self, epoch, logs=None): self.steps_before_epoch += self.steps_in_epoch self.steps_in_epoch = 0 - - -class SimpleCheckpoint(tf.keras.callbacks.Callback): - """Keras callback to save tf.train.Checkpoints.""" - - def __init__(self, checkpoint_manager): - super(SimpleCheckpoint, self).__init__() - self.checkpoint_manager = checkpoint_manager - - def on_epoch_end(self, epoch, logs=None): - step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access - self.checkpoint_manager.save(checkpoint_number=step_counter) - - -def set_session_config(enable_xla=False): - """Sets the session config.""" - if enable_xla: - tf.config.optimizer.set_jit(True) - - -# TODO(hongkuny): remove set_config_v2 globally. -set_config_v2 = set_session_config - - -def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): - """Set GPU thread mode and count, and adjust dataset threads count.""" - cpu_count = multiprocessing.cpu_count() - logging.info('Logical CPU cores: %s', cpu_count) - - # Allocate private thread pool for each GPU to schedule and launch kernels - per_gpu_thread_count = per_gpu_thread_count or 2 - os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode - os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) - logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) - logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) - - # Limit data preprocessing threadpool to CPU cores minus number of total GPU - # private threads and memory copy threads. 
-  total_gpu_thread_count = per_gpu_thread_count * num_gpus
-  num_runtime_threads = num_gpus
-  if not datasets_num_private_threads:
-    datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8)
-  logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads)
diff --git a/deepray/callbacks/time_stopping.py b/deepray/callbacks/time_stopping.py
index ca23885e..4196bf0e 100644
--- a/deepray/callbacks/time_stopping.py
+++ b/deepray/callbacks/time_stopping.py
@@ -18,7 +18,7 @@
 import time
 
 import tensorflow as tf
-from tensorflow.keras.callbacks import Callback
+from tf_keras.callbacks import Callback
 from typeguard import typechecked
 
 
diff --git a/deepray/callbacks/tqdm_progress_bar.py b/deepray/callbacks/tqdm_progress_bar.py
index c51291dc..8805b677 100644
--- a/deepray/callbacks/tqdm_progress_bar.py
+++ b/deepray/callbacks/tqdm_progress_bar.py
@@ -18,7 +18,7 @@
 from collections import defaultdict
 
 import tensorflow as tf
-from tensorflow.keras.callbacks import Callback
+from tf_keras.callbacks import Callback
 from typeguard import typechecked
 
 
diff --git a/deepray/callbacks/training_speed.py b/deepray/callbacks/training_speed.py
new file mode 100644
index 00000000..72ab756f
--- /dev/null
+++ b/deepray/callbacks/training_speed.py
@@ -0,0 +1,155 @@
+# Copyright 2023 The Deepray Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from absl import flags
+from tf_keras.callbacks import Callback
+from tf_keras.src.utils import io_utils
+
+from deepray.utils import logging_util
+from deepray.utils.benchmark import PerformanceCalculator
+from deepray.utils.horovod_utils import get_world_size, is_main_process
+
+logger = logging_util.get_logger()
+
+
+class TrainingSpeed(Callback):
+  """Callback that logs training throughput (examples/sec) to stdout.
+
+  Args:
+      batch_size: Per-replica (local) batch size used to compute throughput.
+          Defaults to `flags.FLAGS.batch_size`. When `flags.FLAGS.use_horovod`
+          is set, the global batch size becomes `batch_size * get_world_size()`.
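+
+  Example (a minimal sketch, assuming absl flags are already parsed and the
+  model is compiled; `train_ds` is a hypothetical `tf.data.Dataset`):
+
+  ```python
+  speed_cb = TrainingSpeed(batch_size=1024)
+  speed_cb.set_optimizer(model.optimizer)  # optional, enables warm-up exclusion
+  model.fit(train_ds, epochs=3, callbacks=[speed_cb])
+  ```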
+  """
+
+  def __init__(self, batch_size: int = None):
+    super().__init__()
+    local_batch_size = batch_size or flags.FLAGS.batch_size
+    logger.info(f"Callback using local (per-replica) batch_size: {local_batch_size}")
+
+    if flags.FLAGS.use_horovod:
+      world_size = get_world_size()
+      self.global_batch_size = local_batch_size * world_size
+      if is_main_process():
+        logger.info(f"Horovod enabled: global_batch_size set to {self.global_batch_size} ({world_size} workers)")
+    else:
+      self.global_batch_size = local_batch_size
+
+    self.seen = 0
+    self.performance_calculator = None
+    self.epochs = 1
+    # The optimizer is optional; when provided via `set_optimizer`, its
+    # iteration counter is used to exclude warm-up steps from the final average.
+    self.optimizer = None
+
+    self._train_step, self._test_step, self._predict_step = None, None, None
+    self._call_batch_hooks = True
+
+    self._called_in_fit = False
+
+  def set_params(self, params):
+    self.epochs = params["epochs"]
+    self._call_batch_hooks = True
+    try:
+      self._train_step = self.model._train_counter
+      self._test_step = self.model._test_counter
+      self._predict_step = self.model._predict_counter
+    except AttributeError:
+      self._call_batch_hooks = True
+
+    self.last_step = 0
+    if isinstance(self.last_step, (tf.Tensor, tf.Variable)):
+      self.last_step = self.last_step.numpy()
+
+  def set_optimizer(self, optimizer):
+    self.optimizer = optimizer
+
+  def on_train_begin(self, logs=None):
+    # When this logger is called inside `fit`, validation is silent.
+    self._called_in_fit = True
+    self._perf_wo = 0
+    self._perf_wo_n = 0
+
+    # Record the step count at the start of training so the first (warm-up)
+    # steps can be excluded from the throughput average.
+    if self.optimizer is not None and hasattr(self.optimizer, "iterations"):
+      self._first_steps = self.optimizer.iterations.numpy()
+    else:
+      self._first_steps = 0
+
+  def on_test_begin(self, logs=None):
+    if not self._called_in_fit:
+      self._reset_progbar()
+      self._maybe_init_progbar()
+
+  def on_predict_begin(self, logs=None):
+    self._reset_progbar()
+    self._maybe_init_progbar()
+
+  def on_train_batch_end(self, batch, logs=None):
+    if is_main_process():
+      self._batch_update_progbar(batch, logs)
+
+  def on_test_batch_end(self, batch, logs=None):
+    if not self._called_in_fit:
+      self._batch_update_progbar(batch, logs)
+
+  def on_predict_batch_end(self, batch, logs=None):
+    # Don't pass prediction results.
+    self._batch_update_progbar(batch, None)
+
+  def on_test_end(self, logs=None):
+    if not self._called_in_fit:
+      self._finalize_progbar(logs, self._test_step)
+
+  def on_predict_end(self, logs=None):
+    self._finalize_progbar(logs, self._predict_step)
+
+  def _reset_progbar(self):
+    self.seen = 0
+    self.performance_calculator = None
+
+  def _maybe_init_progbar(self):
+    if self.performance_calculator is None:
+      self.performance_calculator = PerformanceCalculator()
+
+  def _implements_train_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _implements_test_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _implements_predict_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _batch_update_progbar(self, batch, logs=None):
+    """Updates the performance_calculator."""
+    self._maybe_init_progbar()
+    self.seen = batch + 1  # One-indexed.
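+    # `self.seen` is the 1-indexed batch count, so the difference below is the
+    # number of steps completed since the previous report; PerformanceCalculator
+    # converts that into a samples/sec figure using the global batch size.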
+ delta_steps = self.seen - self.last_step + + step_throughput = self.performance_calculator(delta_steps, self.global_batch_size) + logger.info('Perf %.2f samples/s' % step_throughput) + + if batch > self._first_steps + delta_steps * 2: + self._perf_wo += step_throughput + self._perf_wo_n += 1 + + self.last_step = self.seen + + def _finalize_progbar(self, logs, counter): + results_perf = self.performance_calculator.get_current_benchmark_results() + logger.info(results_perf) + if self._perf_wo_n != 0: + logger.info("Throughput Average (examples/sec) = %0.2f", self._perf_wo / self._perf_wo_n) diff --git a/deepray/copts.bzl b/deepray/copts.bzl index e56213f0..42dd8ccc 100644 --- a/deepray/copts.bzl +++ b/deepray/copts.bzl @@ -71,3 +71,11 @@ TEST_CPP_COPTS = DEFAULT_CPP_COPTS + [ TEST_LINKOPTS = DEFAULT_LINKOPTS + [ "-fsanitize=address", ] + +# cc_* rules should include this list in copts. If additional cc_*-wide +# customization appears, we might want to switch to macros. + +"""This is the definition site for things we want to keep consistent, like copts.""" + +FCP_COPTS = [ +] diff --git a/deepray/core/base_trainer.py b/deepray/core/base_trainer.py deleted file mode 100644 index 68ee81c0..00000000 --- a/deepray/core/base_trainer.py +++ /dev/null @@ -1,991 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A light weight utilities to train TensorFlow models.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import json -import os -import sys -import time -from typing import Union, List, Dict, Text - -import tensorflow as tf -from absl import logging, flags -from dllogger import Verbosity -from keras.engine import compile_utils -from keras.engine import data_adapter -from packaging import version - -from .compile_utils import HvdMetricsContainer - -if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - from tensorflow.keras import optimizers -else: - from tensorflow.keras.optimizers import legacy as optimizers -from deepray.callbacks import HvdCallbackList -from deepray.core.common import distribution_utils -from deepray.optimizers.optimization import GradientAccumulator -from deepray.utils import dllogger_class -from deepray.utils import gpu_affinity -from deepray.utils.flags import common_flags -from deepray.utils.misc import keras_utils -from deepray.utils.benchmark import PerformanceCalculator -from deepray.utils.horovod_utils import is_main_process, get_world_size -from deepray.utils import export - -from .module import Module - -_SUMMARY_TXT = 'training_summary.txt' -_MIN_SUMMARY_STEPS = 10 -FLAGS = flags.FLAGS - -if FLAGS.use_dynamic_embedding: - from tensorflow_recommenders_addons import dynamic_embedding as de - from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_ops import TrainableWrapper, DEResourceVariable - tf.train.Checkpoint = de.train.checkpoint.DEHvdCheckpoint -else: - TrainableWrapper, DEResourceVariable = type(None), type(None) - -# Users should always run this script under TF 2.x -# The container haven't changed version number yet, skip the check. -assert tf.version.VERSION.startswith('2.') - -gpus = tf.config.experimental.list_physical_devices('GPU') -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - -if FLAGS.use_horovod: - if FLAGS.keras_use_ctl: - import horovod.tensorflow as hvd - else: - import horovod.tensorflow.keras as hvd - from horovod.tensorflow.compression import Compression - - hvd.init() - if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') - gpu_affinity.set_affinity(hvd.local_rank()) - -# Enables XLA in Session Config. Should not be set for TPU. -keras_utils.set_config_v2(FLAGS.enable_xla) - -use_float16 = common_flags.use_float16() -if use_float16: - policy = tf.keras.mixed_precision.Policy("mixed_float16") - tf.keras.mixed_precision.set_global_policy(policy) - logging.info("mixed_float16 enabled!") - - -def write_txt_summary(training_summary, summary_dir): - """Writes a summary text file to record stats.""" - summary_path = os.path.join(summary_dir, _SUMMARY_TXT) - with tf.io.gfile.GFile(summary_path, 'wb') as f: - logging.info('Training Summary: \n%s', str(training_summary)) - f.write(json.dumps(training_summary, indent=4, default=str)) - - -class Trainer(Module): - """Configures the model for training. - - Example: - - ```python - model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), - loss=tf.keras.losses.BinaryCrossentropy(), - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.FalseNegatives()]) - ``` - - Args: - optimizer: String (name of optimizer) or optimizer instance. See - `tf.keras.optimizers`. - loss: Loss function. 
May be a string (name of loss function), or - a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss - function is any callable with the signature `loss = fn(y_true, - y_pred)`, where `y_true` are the ground truth values, and - `y_pred` are the model's predictions. - `y_true` should have shape - `(batch_size, d0, .. dN)` (except in the case of - sparse loss functions such as - sparse categorical crossentropy which expects integer arrays of - shape `(batch_size, d0, .. dN-1)`). - `y_pred` should have shape `(batch_size, d0, .. dN)`. - The loss function should return a float tensor. - If a custom `Loss` instance is - used and reduction is set to `None`, return value has shape - `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss - values; otherwise, it is a scalar. If the model has multiple - outputs, you can use a different loss on each output by passing a - dictionary or a list of losses. The loss value that will be - minimized by the model will then be the sum of all individual - losses, unless `loss_weights` is specified. - metrics: List of metrics to be evaluated by the model during - training and testing. Each of this can be a string (name of a - built-in function), function or a `tf.keras.metrics.Metric` - instance. See `tf.keras.metrics`. Typically you will use - `metrics=['accuracy']`. - A function is any callable with the signature `result = fn(y_true, - y_pred)`. To specify different metrics for different outputs of a - multi-output model, you could also pass a dictionary, such as - `metrics={'output_a':'accuracy', 'output_b':['accuracy', 'mse']}`. - You can also pass a list to specify a metric or a list of metrics - for each output, such as - `metrics=[['accuracy'], ['accuracy', 'mse']]` - or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the - strings 'accuracy' or 'acc', we convert this to one of - `tf.keras.metrics.BinaryAccuracy`, - `tf.keras.metrics.CategoricalAccuracy`, - `tf.keras.metrics.SparseCategoricalAccuracy` based on the shapes - of the targets and of the model output. We do a similar - conversion for the strings 'crossentropy' and 'ce' as well. - The metrics passed here are evaluated without sample weighting; if - you would like sample weighting to apply, you can specify your - metrics via the `weighted_metrics` argument instead. - loss_weights: Optional list or dictionary specifying scalar - coefficients (Python floats) to weight the loss contributions of - different model outputs. The loss value that will be minimized by - the model will then be the *weighted sum* of all individual - losses, weighted by the `loss_weights` coefficients. If a list, - it is expected to have a 1:1 mapping to the model's outputs. If a - dict, it is expected to map output names (strings) to scalar - coefficients. - weighted_metrics: List of metrics to be evaluated and weighted by - `sample_weight` or `class_weight` during training and testing. - run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s - logic will not be wrapped in a `tf.function`. Recommended to leave - this as `None` unless your `Model` cannot be run inside a - `tf.function`. `run_eagerly=True` is not supported when using - `tf.distribute.experimental.ParameterServerStrategy`. - steps_per_execution: Int. Defaults to 1. The number of batches to - run during each `tf.function` call. Running multiple batches - inside a single `tf.function` call can greatly improve performance - on TPUs or small models with a large Python overhead. 
At most, one - full epoch will be run each execution. If a number larger than the - size of the epoch is passed, the execution will be truncated to - the size of the epoch. Note that if `steps_per_execution` is set - to `N`, `Callback.on_batch_begin` and `Callback.on_batch_end` - methods will only be called every `N` batches (i.e. before/after - each `tf.function` execution). - jit_compile: If `True`, compile the model training step with XLA. - [XLA](https://www.tensorflow.org/xla) is an optimizing compiler - for machine learning. - `jit_compile` is not enabled for by default. - Note that `jit_compile=True` - may not necessarily work for all models. - For more information on supported operations please refer to the - [XLA documentation](https://www.tensorflow.org/xla). - Also refer to - [known XLA issues](https://www.tensorflow.org/xla/known_issues) - for more details. - **kwargs: Arguments supported for backwards compatibility only. - """ - - def __init__( - self, - model: Union[tf.keras.Model, List[tf.keras.Model], Dict[Text, tf.keras.Model]], - optimizer="rmsprop", - loss=None, - metrics=None, - loss_weights=None, - weighted_metrics=None, - use_horovod=None, - run_eagerly=None, - jit_compile=None, - **kwargs - ): - super().__init__(**kwargs) - self._model = {} - if isinstance(model, list): - if len(model) > 0: - self._model = {"main": model[0]} - if len(model) == 2: - self._model["sub_model"] = model[1] - else: - for i in range(1, len(model)): - self._model[f"sub_model{i}"] = model[i] - else: - raise ValueError("Not a reachable model.") - elif isinstance(model, dict): - main_keys = [k for k in model.keys() if "main" in k] - if len(main_keys) == 1: - if (len(model) == 1): - self._model = {"main": next(iter(model.values()))} - else: - self._model = model - else: - raise ValueError(f"Must set only one model with key contains \"main\", found {main_keys}.") - elif isinstance(model, tf.keras.Model): - self._model = {"main": model} - else: - raise ValueError("Not a reachable model.") - - self._loss = loss - self._metrics = metrics - self._loss_weights = loss_weights - self._weighted_metrics = weighted_metrics - - self.use_horovod = use_horovod if use_horovod else FLAGS.use_horovod - self.run_eagerly = run_eagerly if run_eagerly else FLAGS.run_eagerly - self._jit_compile = jit_compile - - self.epochs = FLAGS.epochs - - if is_main_process(): - logging.info(" {} Initialize training".format(time.strftime("%Y%m%d %H:%M:%S"))) - - logging.info("\ttf.app.flags.FLAGS:") - for key, value in sorted(FLAGS.flag_values_dict().items()): - logging.info(f"\t{key:25}= {value}") - - self.global_batch_size = FLAGS.batch_size * FLAGS.num_accumulation_steps - learning_rate = FLAGS.learning_rate - - if self.use_horovod: - self.global_batch_size *= get_world_size() - learning_rate *= get_world_size() - - # TODO: fuhailin - # if isinstance(optimizer, optimizers.Optimizer): - self.optimizer = optimizer - # else: - # raise ValueError("Not support opt.") - self.use_float16 = common_flags.use_float16() - if self.use_float16: - self.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(self.optimizer, dynamic=True) - - with distribution_utils.get_strategy_scope(self._distribution_strategy): - # To correctly place the model weights on accelerators, - # model should be created in scope. 
- if isinstance(self._loss, compile_utils.LossesContainer): - self.loss_container = self._loss - else: - self.loss_container = compile_utils.LossesContainer( - self._loss, self._loss_weights, output_names=self.main_model.output_names - ) - self.metric_container = HvdMetricsContainer( - self._metrics, - self._weighted_metrics, - output_names=self.main_model.output_names, - # from_serialized=from_serialized, - ) if self._metrics or self._weighted_metrics else None - - @property - def main_model(self): - """ - Returns: - The main model - """ - if len(self._model) == 1: - return self._model["main"] - else: - for name, _model in self._model.items(): - if "main" in name: - return _model - ValueError("Could not find the main model.") - - @property - def models(self): - if len(self._model) == 1: - return self._model["main"] - else: - return self._model - - @property - def checkpoint(self): - if len(self._checkpoints) == 1: - return self._checkpoints["main"] - else: - return self._checkpoints - - @property - def manager(self): - if len(self._managers) == 1: - return self._managers["main"] - else: - return self._managers - - def fit( - self, - train_input=None, - eval_input=None, - eval_steps=None, - verbose="auto", - callbacks=[], - steps_per_epoch: int = None, - ): - """Trains the model for a fixed number of epochs (dataset iterations). - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. Should return a tuple - of either `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - - A generator or `keras.utils.Sequence` returning `(inputs, - targets)` or `(inputs, targets, sample_weights)`. - - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a - callable that takes a single argument of type - `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. - `DatasetCreator` should be used when users prefer to specify the - per-replica batching and sharding logic for the `Dataset`. - See `tf.keras.utils.experimental.DatasetCreator` doc for more - information. - A more detailed description of unpacking behavior for iterator - types (Dataset, generator, Sequence) is given below. If these - include `sample_weights` as a third component, note that sample - weighting applies to the `weighted_metrics` argument but not the - `metrics` argument in `compile()`. If using - `tf.distribute.experimental.ParameterServerStrategy`, only - `DatasetCreator` type is supported for `x`. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, generator, - or `keras.utils.Sequence` instance, `y` should - not be specified (since targets will be obtained from `x`). - batch_size: Integer or `None`. - Number of samples per gradient update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` - instances (since they generate batches). - epochs: Integer. Number of epochs to train the model. - An epoch is an iteration over the entire `x` and `y` - data provided - (unless the `steps_per_epoch` flag is set to - something other than None). 
- Note that in conjunction with `initial_epoch`, - `epochs` is to be understood as "final epoch". - The model is not trained for a number of iterations - given by `epochs`, but merely until the epoch - of index `epochs` is reached. - verbose: 'auto', 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - 'auto' defaults to 1 for most cases, but 2 when used with - `ParameterServerStrategy`. Note that the progress bar is not - particularly useful when logged to a file, so verbose=2 is - recommended when not running interactively (eg, in a production - environment). - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during training. - See `tf.keras.callbacks`. Note - `tf.keras.callbacks.ProgbarLogger` and - `tf.keras.callbacks.History` callbacks are created automatically - and need not be passed into `model.fit`. - `tf.keras.callbacks.ProgbarLogger` is created or not based on - `verbose` argument to `model.fit`. - Callbacks with batch-level calls are currently unsupported with - `tf.distribute.experimental.ParameterServerStrategy`, and users - are advised to implement epoch-level calls instead with an - appropriate `steps_per_epoch` value. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - The model will set apart this fraction of the training data, - will not train on it, and will evaluate - the loss and any model metrics - on this data at the end of each epoch. - The validation data is selected from the last samples - in the `x` and `y` data provided, before shuffling. This - argument is not supported when `x` is a dataset, generator or - `keras.utils.Sequence` instance. - If both `validation_data` and `validation_split` are provided, - `validation_data` will override `validation_split`. - `validation_split` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - validation_data: Data on which to evaluate - the loss and any model metrics at the end of each epoch. - The model will not be trained on this data. Thus, note the fact - that the validation loss of data provided using - `validation_split` or `validation_data` is not affected by - regularization layers like noise and dropout. - `validation_data` will override `validation_split`. - `validation_data` could be: - - A tuple `(x_val, y_val)` of Numpy arrays or tensors. - - A tuple `(x_val, y_val, val_sample_weights)` of NumPy - arrays. - - A `tf.data.Dataset`. - - A Python generator or `keras.utils.Sequence` returning - `(inputs, targets)` or `(inputs, targets, sample_weights)`. - `validation_data` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - shuffle: Boolean (whether to shuffle the training data - before each epoch) or str (for 'batch'). This argument is - ignored when `x` is a generator or an object of tf.data.Dataset. - 'batch' is a special option for dealing - with the limitations of HDF5 data; it shuffles in batch-sized - chunks. Has no effect when `steps_per_epoch` is not `None`. - class_weight: Optional dictionary mapping class indices (integers) - to a weight (float) value, used for weighting the loss function - (during training only). - This can be useful to tell the model to - "pay more attention" to samples from - an under-represented class. - sample_weight: Optional Numpy array of weights for - the training samples, used for weighting the loss function - (during training only). 
You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. - This argument is not supported when `x` is a dataset, generator, - or `keras.utils.Sequence` instance, instead provide the - sample_weights as the third element of `x`. - Note that sample weighting does not apply to metrics specified - via the `metrics` argument in `compile()`. To apply sample - weighting to your metrics, you can specify them via the - `weighted_metrics` in `compile()` instead. - initial_epoch: Integer. - Epoch at which to start training - (useful for resuming a previous training run). - steps_per_epoch: Integer or `None`. - Total number of steps (batches of samples) - before declaring one epoch finished and starting the - next epoch. When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps_per_epoch' - is None, the epoch will run until the input dataset is - exhausted. When passing an infinitely repeating dataset, you - must specify the `steps_per_epoch` argument. If - `steps_per_epoch=-1` the training will run indefinitely with an - infinitely repeating dataset. This argument is not supported - with array inputs. - When using `tf.distribute.experimental.ParameterServerStrategy`: - * `steps_per_epoch=None` is not supported. - eval_steps: Only relevant if `validation_data` is provided and - is a `tf.data` dataset. Total number of steps (batches of - samples) to draw before stopping when performing validation - at the end of every epoch. If 'eval_steps' is None, - validation will run until the `validation_data` dataset is - exhausted. In the case of an infinitely repeated dataset, it - will run into an infinite loop. If 'eval_steps' is - specified and only part of the dataset will be consumed, the - evaluation will start from the beginning of the dataset at each - epoch. This ensures that the same validation samples are used - every time. - validation_batch_size: Integer or `None`. - Number of samples per validation batch. - If unspecified, will default to `batch_size`. - Do not specify the `validation_batch_size` if your data is in - the form of datasets, generators, or `keras.utils.Sequence` - instances (since they generate batches). - validation_freq: Only relevant if validation data is provided. - Integer or `collections.abc.Container` instance (e.g. list, tuple, - etc.). If an integer, specifies how many training epochs to run - before a new validation run is performed, e.g. `validation_freq=2` - runs validation every 2 epochs. If a Container, specifies the - epochs on which to run validation, e.g. - `validation_freq=[1, 2, 10]` runs validation at the end of the - 1st, 2nd, and 10th epochs. - max_queue_size: Integer. Used for generator or - `keras.utils.Sequence` input only. Maximum size for the generator - queue. If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up - when using process-based threading. If unspecified, `workers` - will default to 1. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. 
If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children - processes. - - Unpacking behavior for iterator-like inputs: - A common pattern is to pass a tf.data.Dataset, generator, or - tf.keras.utils.Sequence to the `x` argument of fit, which will in fact - yield not only features (x) but optionally targets (y) and sample - weights. Keras requires that the output of such iterator-likes be - unambiguous. The iterator should return a tuple of length 1, 2, or 3, - where the optional second and third elements will be used for y and - sample_weight respectively. Any other type provided will be wrapped in - a length one tuple, effectively treating everything as 'x'. When - yielding dicts, they should still adhere to the top-level tuple - structure. - e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate - features, targets, and weights from the keys of a single dict. - A notable unsupported data type is the namedtuple. The reason is - that it behaves like both an ordered datatype (tuple) and a mapping - datatype (dict). So given a namedtuple of the form: - `namedtuple("example_tuple", ["y", "x"])` - it is ambiguous whether to reverse the order of the elements when - interpreting the value. Even worse is a tuple of the form: - `namedtuple("other_tuple", ["x", "y", "z"])` - where it is unclear if the tuple was intended to be unpacked into x, - y, and sample_weight or passed through as a single element to `x`. As - a result the data processing code will simply raise a ValueError if it - encounters a namedtuple. (Along with instructions to remedy the - issue.) - - Returns: - A `History` object. Its `History.history` attribute is - a record of training loss values and metrics values - at successive epochs, as well as validation loss values - and validation metrics values (if applicable). - - Raises: - RuntimeError: 1. If the model was never compiled or, - 2. If `model.fit` is wrapped in `tf.function`. - - ValueError: In case of mismatch between the provided input data - and what the model expects or when the input data is empty. - """ - self.steps_per_epoch = steps_per_epoch if steps_per_epoch else -1 - self.eval_steps = eval_steps - if FLAGS.benchmark or FLAGS.stop_steps >= 0: - if FLAGS.stop_steps >= 0: - self.steps_per_epoch = FLAGS.stop_steps - else: - self.steps_per_epoch = 1000 - self.epochs = 1 - - if FLAGS.keras_use_ctl: - self._performance_calculator = PerformanceCalculator(total_steps=self.steps_per_epoch * self.epochs) - - self.steps_per_loop = FLAGS.steps_per_summary - if 1 < self.steps_per_epoch < self.steps_per_loop: - if is_main_process(): - logging.error( - 'steps_per_summary: %d is specified to be greater than ' - ' steps_per_epoch: %d, we will use steps_per_epoch as' - ' steps_per_summary.', self.steps_per_loop, self.steps_per_epoch - ) - self.steps_per_loop = self.steps_per_epoch - - self._configure_steps_per_execution(self.steps_per_loop or 1) - assert tf.executing_eagerly() - - if self.run_eagerly: - # if self.steps_per_loop > 1: - # raise ValueError( - # 'steps_per_loop is used for performance optimization. 
When you want ' - # 'to run eagerly, you cannot leverage graph mode loop.') - if isinstance(self._distribution_strategy, tf.distribute.experimental.TPUStrategy): - raise ValueError( - 'TPUStrategy should not run eagerly as it heavily replies on graph' - ' optimization for the distributed system.' - ) - - self.make_train_function() - - # Create summary writers - if is_main_process(): - self.summary_dir = os.path.join(FLAGS.model_dir, 'summaries') - self.eval_summary_writer = tf.summary.create_file_writer(os.path.join(self.summary_dir, 'eval')) - if self.steps_per_loop >= _MIN_SUMMARY_STEPS: - # Only writes summary when the stats are collected sufficiently over - # enough steps. - self.train_summary_writer = tf.summary.create_file_writer(os.path.join(self.summary_dir, 'train')) - else: - self.train_summary_writer = None - else: - self.eval_summary_writer = None - self.train_summary_writer = None - eval_input_fn = None - - self._checkpoints, self._managers = {}, {} - for name, model in self._model.items(): - if "main" in name: - _checkpoint = tf.train.Checkpoint(model=model, optimizer=self.optimizer) - self._checkpoints[name] = _checkpoint - self._managers[name] = tf.train.CheckpointManager( - _checkpoint, os.path.join(FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=3 - ) - else: - _checkpoint = tf.train.Checkpoint(model=model) - self._checkpoints[name] = _checkpoint - self._managers[name] = tf.train.CheckpointManager( - _checkpoint, os.path.join(FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=3 - ) - - if FLAGS.init_checkpoint: - for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), FLAGS.init_checkpoint): - if init_ckpt: - if tf.io.gfile.isdir(init_ckpt): - latest_checkpoint = tf.train.latest_checkpoint(init_ckpt) - else: - latest_checkpoint = init_ckpt - logging.info( - f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.' - ) - ckpt.restore(latest_checkpoint).assert_existing_objects_matched() - logging.info('Loading from checkpoint file completed') - - if FLAGS.init_weights: - for (name, _model), init_weight in zip(self._model.items(), FLAGS.init_weights): - if init_weight: - logging.info(f'variables file {init_weight} found and restoring from initial variables for {name} model.') - _model.load_weights(os.path.join(init_weight, "variables")) - logging.info('Loading from weights file completed') - - if FLAGS.num_accumulation_steps > 1: - self.accum_gradients = GradientAccumulator() - - verbose = 0 # training_module._get_verbosity(verbose, self._distribution_strategy) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, HvdCallbackList): - self.callbacks = HvdCallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self.main_model, - verbose=verbose, - epochs=self.epochs, - steps=self.steps_per_epoch * self.epochs, - ) - return self.run_customized_training_loop(train_input, eval_input) - else: - if FLAGS.use_horovod and not FLAGS.use_dynamic_embedding: - # Add Horovod Distributed Optimizer - opt = hvd.DistributedOptimizer(self.optimizer) - else: - opt = self.optimizer - - self.main_model.compile( - optimizer=opt, - loss=self._loss, - loss_weights=self._loss_weights, - metrics=self._metrics, - weighted_metrics=self._weighted_metrics, - run_eagerly=self.run_eagerly - ) - - # if not FLAGS.benchmark: - # # Create Tensorboard summary and checkpoint callbacks. 
- # summary_dir = os.path.join(FLAGS.model_dir, "summaries") - # callbacks.append(tf.keras.callbacks.TensorBoard(summary_dir, profile_batch=0)) - - # # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. - # if is_main_process(): - # checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint") - # callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True)) - - if FLAGS.use_horovod: - callbacks += [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # hvd callback用于广播rank0的初始化器产生的值 - de.keras.callbacks.DEHvdBroadcastGlobalVariablesCallback(root_rank=0) - if FLAGS.use_dynamic_embedding else hvd.callbacks.BroadcastGlobalVariablesCallback(0), - ] - - # Horovod: write logs on worker 0. - verbose = 2 if is_main_process() else 0 - history = self.main_model.fit( - train_input, - epochs=self.epochs, - steps_per_epoch=self.steps_per_epoch if self.steps_per_epoch else None, - callbacks=callbacks, - validation_data=eval_input, - validation_steps=eval_steps, - verbose=verbose - ) - return history - - def run_customized_training_loop( - self, - train_input=None, - eval_input=None, - ): - # if self.epochs > 1 and FLAGS.num_train_examples == -1: - # raise ValueError('When the num_train_examples is INFINITE or UNKNOWN, we just can run one epoch.') - - # Training loop starts here. - self.current_step = self._first_steps = self.optimizer.iterations.numpy() - - if self.use_horovod: - with tf.init_scope(): - self.first_batch = tf.Variable(True, trainable=False, dtype=tf.bool, name='first_batch') - if not hasattr(self.main_model, 'optimizer'): - raise ValueError('User should set optimizer attribute to model ' - 'inside `model_fn`.') - # if self.sub_model_export_name and self.sub_model is None: - # raise ValueError('sub_model_export_name is specified as %s, but ' - # 'sub_model is None.' % self.sub_model_export_name) - - self._steps_from_save = 0 - start_time = time.time() - self._perf_wo = 0 - self._perf_wo_n = 0 - - self.callbacks.on_train_begin() - training_logs = None - for epoch in range(self.epochs): - train_iterator = distribution_utils.make_distributed_iterator(self._distribution_strategy, train_input) - self.on_epoch_begin(epoch) - while self.steps_per_epoch < 0 or self._step_epoch < self.steps_per_epoch: - t0 = time.time() - self.callbacks.on_train_batch_begin(self.current_step) - # Runs several steps in the host while loop. - steps, num_accumulation_steps = self.steps_to_run(self.current_step, self.steps_per_epoch, self.steps_per_loop) - - try: - if steps == 1: - training_logs = self._train_step(next(train_iterator), num_accumulation_steps) - else: - # Converts steps to a Tensor to avoid tf.function retracing. 
- training_logs = self._train_steps( - train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32), num_accumulation_steps - ) - except (tf.errors.OutOfRangeError, StopIteration): - if is_main_process(): - logging.info(f"Done reading data for epoch {epoch}") - if self.optimizer.iterations.numpy() == self._first_steps: - logging.warning("No data was processed.") - return None - elif steps > 1 and self.optimizer.iterations.numpy() > self.current_step: - steps = self.optimizer.iterations.numpy() - self.current_step - training_logs = self.get_metrics_result() - self.on_batch_end(training_logs, steps, t0) - break - - self.on_batch_end(training_logs, steps, t0) - self.on_epoch_end(epoch, self.current_step, eval_input, epoch_logs=training_logs) - if self.main_model.stop_training: - logging.info(f"self.model.stop_training = {self.main_model.stop_training}") - break - self.callbacks.on_train_end(logs=training_logs) - - total_time = time.time() - start_time - results_perf = self._performance_calculator.results - if not self._performance_calculator.completed: - logging.info(f"self._performance_calculator.completed: {self._performance_calculator.completed}") - results_perf = self._performance_calculator.get_current_benchmark_results() - - export.export_to_checkpoint(self.manager, self.current_step) - if is_main_process(): - training_summary = {'total_training_steps': self.current_step} - if self.loss_container: - training_summary['train_loss'] = self._float_metric_value(self.loss_container.metrics[0]) - - if self.metric_container and self.metric_container.metrics: - # TODO(hongkuny): Cleans up summary reporting in text. - for metric in self.metric_container.metrics: - training_summary['last_' + metric.name] = self._float_metric_value(metric) - # training_summary['eval_metrics'] = _float_metric_value(self.metric_container.metrics[0]) - - write_txt_summary(training_summary, self.summary_dir) - - dllogging = dllogger_class.dllogger_class(FLAGS.dllog_path) - total_sentences = self.current_step * self.global_batch_size - logging.info("-----------------------------") - logging.info(" Batch size = %d", FLAGS.batch_size) - logging.info(" Num steps = %d", self.current_step) - logging.info(" LR = %g", FLAGS.learning_rate) - if self.use_horovod: - logging.info("Multi-GPU training with TF Horovod") - logging.info("hvd.size() = %d", get_world_size()) - logging.info("Total Training Time = %0.2f for Examples = %d", total_time, total_sentences) - logging.info("Throughput Average (examples/sec) with overhead = %0.2f", results_perf['throughput']) - if self._perf_wo_n != 0: - logging.info("Throughput Average (examples/sec) = %0.2f", self._perf_wo / self._perf_wo_n) - logging.info("-----------------------------") - - if dllogging and self._perf_wo_n != 0: - dllogging.logger.log( - step=(), data={"throughput_train": self._perf_wo / self._perf_wo_n}, verbosity=Verbosity.DEFAULT - ) - dllogging.logger.log(step=(), data={"total_loss": training_summary['train_loss']}, verbosity=Verbosity.DEFAULT) - dllogging.logger.log(data=results_perf, step=tuple()) - - return self.main_model - - def train_single_step(self, iterator, num_grad_accumulates): - """Performs a distributed training step. - - Args: - iterator: the distributed iterator of training datasets. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. 
- """ - if num_grad_accumulates != 1: - for _ in tf.range(num_grad_accumulates): - self.forward(iterator) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self.step(num_grad_accumulates) - if self.use_horovod and self.first_batch: - self.do_broadcast() - else: - self._replicated_step(iterator) - return self.get_metrics_result() - - @property - def trainable_variables(self): - if hasattr(self.loss_container, 'trainable_variables'): - return self.main_model.trainable_variables + self.loss_container.trainable_variables - else: - return self.main_model.trainable_variables - - def do_broadcast(self): - model_broadcast_vars = [ - var for var in self.main_model.variables - if (not isinstance(var, TrainableWrapper)) and (not isinstance(var, DEResourceVariable)) - ] - opt_broadcast_vars = [ - var for var in self.optimizer.variables() - if (not isinstance(var, TrainableWrapper)) and (not isinstance(var, DEResourceVariable)) - ] - - print_op = tf.print( - f"Broadcasting {len(model_broadcast_vars)} model variables & {len(opt_broadcast_vars)} optimizer variables...", - output_stream=sys.stdout - ) - with tf.control_dependencies([print_op]): - hvd.broadcast_variables(model_broadcast_vars + opt_broadcast_vars, root_rank=0) - self.first_batch.assign(False) - - def _replicated_step(self, inputs): - """Replicated training step.""" - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(inputs) - with tf.GradientTape() as tape: - model_outputs = self.main_model(x, training=True) - loss = self.loss_container(y, model_outputs, sample_weight=sample_weight) - - if self.use_horovod and not FLAGS.use_dynamic_embedding: - tape = hvd.DistributedGradientTape( - tape, sparse_as_dense=False, compression=Compression.fp16 if self.use_float16 else Compression.none - ) - # Run backwards pass. - self.optimizer.minimize(loss, self.trainable_variables, tape=tape) - - if self.use_horovod and self.first_batch: - self.do_broadcast() - - # For reporting, the metric takes the mean of losses. - if self.metric_container: - self.metric_container.update_state(y_true=y, y_pred=model_outputs, sample_weight=sample_weight) - - def forward(self, inputs): - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(inputs) - with tf.GradientTape() as tape: - model_outputs = self.main_model(x, training=True) - loss = self.loss_container(y, model_outputs, sample_weight=sample_weight) - - # Compute gradients - if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - grads_and_vars = self.optimizer._compute_gradients(loss=loss, var_list=self.trainable_variables, tape=tape) - else: - grads_and_vars = self.optimizer.compute_gradients(loss=loss, var_list=self.trainable_variables, tape=tape) - grads = [g for g, _ in grads_and_vars] - self.accum_gradients.add_gradients(grads) - - # For reporting, the metric takes the mean of losses. 
- if self.metric_container: - self.metric_container.update_state(y_true=y, y_pred=model_outputs, sample_weight=sample_weight) - - def step(self, num_grad_accumulates): - gradients = self.accum_gradients.gradients - if self.use_horovod: - gradients = [ - None if g is None else hvd.allreduce( - g / tf.cast(num_grad_accumulates, g.dtype), - compression=Compression.fp16 if self.use_float16 else Compression.none - ) for g in gradients - ] - else: - gradients = [None if g is None else g / tf.cast(num_grad_accumulates, g.dtype) for g in gradients] - - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.accum_gradients.reset() - - def train_steps_strategy(self, iterator, steps, num_grad_accumulates): - """Performs distributed training steps in a loop. - - Args: - iterator: the distributed iterator of training datasets. - steps: a tf.int32 integer tensor to specify number of steps to run - inside host training loop. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - """ - if not isinstance(steps, tf.Tensor): - raise ValueError('steps should be an Tensor. Python object may cause ' - 'retracing.') - - if num_grad_accumulates != 1: - for _ in tf.range(steps * num_grad_accumulates): - self._distribution_strategy.run(self.forward, args=(next(iterator),)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self._distribution_strategy.run(self.step, args=(num_grad_accumulates,)) - else: - for _ in tf.range(steps): - self._distribution_strategy.run(self._replicated_step, args=(next(iterator),)) - return self.get_metrics_result() - - def train_steps(self, iterator, steps, num_grad_accumulates): - if not isinstance(steps, tf.Tensor): - raise ValueError('steps should be an Tensor. Python object may cause ' - 'retracing.') - - if num_grad_accumulates != 1: - for _ in tf.range(steps * num_grad_accumulates): - self.forward(next(iterator)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self.step(num_grad_accumulates) - if self.use_horovod and self.first_batch: - self.do_broadcast() - else: - for _ in tf.range(steps): - self._replicated_step(next(iterator)) - return self.get_metrics_result() - - def train_single_step_strategy(self, iterator, num_grad_accumulates): - """Performs a distributed training step. - - Args: - iterator: the distributed iterator of training datasets. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. 
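For orientation, here is a minimal standalone sketch of the accumulate-then-apply pattern that `forward`/`step` in the removed base_trainer.py implement: gradients are summed into accumulator variables over several micro-batches, averaged once, and only then applied. The toy model, data, and `accum` list below are illustrative stand-ins, not the trainer's actual `accum_gradients` object, and the Horovod allreduce is only indicated by a comment.

```python
# Sketch of gradient accumulation with plain TF 2.x (illustrative only).
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
optimizer = tf.keras.optimizers.SGD(0.1)
num_accum = 4

# One non-trainable accumulator per trainable variable.
accum = [tf.Variable(tf.zeros_like(v), trainable=False) for v in model.trainable_variables]

x = tf.random.normal((8, 3))
y = tf.random.normal((8, 1))

for _ in range(num_accum):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
  grads = tape.gradient(loss, model.trainable_variables)
  for a, g in zip(accum, grads):
    a.assign_add(g)

# Average over the accumulation window (this is where the hvd.allreduce in
# `step` above would go when Horovod is enabled), then apply and reset.
avg = [a / num_accum for a in accum]
optimizer.apply_gradients(zip(avg, model.trainable_variables))
for a in accum:
  a.assign(tf.zeros_like(a))
```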
- """ - if num_grad_accumulates != 1: - for _ in tf.range(num_grad_accumulates): - self._distribution_strategy.run(self.forward, args=(iterator,)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self._distribution_strategy.run(self.step, args=(num_grad_accumulates,)) - else: - self._distribution_strategy.run(self._replicated_step, args=(iterator,)) - return self.get_metrics_result() - - def make_train_function(self): - if not self.run_eagerly: - _train_single_step = tf.function(self.train_single_step) - _train_multi_steps = tf.function(self.train_steps) - else: - _train_single_step = self.train_single_step - _train_multi_steps = self.train_steps - - if self._distribution_strategy: - self._train_step = self.train_single_step_strategy - self._train_steps = self.train_steps_strategy - else: - self._train_step = _train_single_step - self._train_steps = _train_multi_steps diff --git a/deepray/core/base_trainer_test.py b/deepray/core/base_trainer_test.py deleted file mode 100644 index 29d59520..00000000 --- a/deepray/core/base_trainer_test.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for tensorflow_models.core.trainers.trainer.""" -# pylint: disable=g-direct-tensorflow-import -import gc -import multiprocessing -import os -import sys - -from absl.testing import parameterized -import orbit -import portpicker -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import base_trainer as trainer_lib -from official.core import config_definitions as cfg -from official.core import train_lib -from official.utils.testing import mock_task - -TPU_TEST = 'test_tpu' in sys.argv[0] -GPU_TEST = 'test_gpu' in sys.argv[0] - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - ) - - -def create_in_process_cluster(num_workers, num_ps): - """Creates and starts local servers and returns the cluster_resolver.""" - worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] - ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] - - cluster_dict = {} - cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports] - if num_ps > 0: - cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports] - - cluster_spec = tf.train.ClusterSpec(cluster_dict) - - # Workers need some inter_ops threads to work properly. 
- worker_config = tf.compat.v1.ConfigProto() - if multiprocessing.cpu_count() < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 - - for i in range(num_workers): - tf.distribute.Server(cluster_spec, job_name='worker', task_index=i, config=worker_config, protocol='grpc') - - for i in range(num_ps): - tf.distribute.Server(cluster_spec, job_name='ps', task_index=i, protocol='grpc') - - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(cluster_spec, rpc_layer='grpc') - return cluster_resolver - - -def dataset_fn(input_context=None): - del input_context - - def dummy_data(_): - return tf.zeros((1, 1), dtype=tf.float32) - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map(dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset - - -class MockAsyncTrainer(trainer_lib._AsyncTrainer): - """Mock AsyncTrainer to test the _AsyncTrainer class.""" - - def __init__(self): - self._strategy = tf.distribute.get_strategy() - self.init_async() - - self.global_step = tf.Variable( - 0, dtype=tf.int64, name='global_step', trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - self.eval_global_step = tf.Variable( - 0, - dtype=tf.int64, - name='eval_global_step', - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - - train_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardTrainer.__init__(self, train_dataset, options=orbit.StandardTrainerOptions()) - - validation_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardEvaluator.__init__( - self, validation_dataset, options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True) - ) - - def train_loop_begin(self): - self.global_step.assign(0) - - def train_step(self, iterator): - - def replica_step(_): - self.global_step.assign_add(1) - - self._strategy.run(replica_step, args=(next(iterator),)) - - def train_loop_end(self): - self.join() - return self.global_step.numpy() - - def eval_begin(self): - self.eval_global_step.assign(0) - - def eval_step(self, iterator): - - def replica_step(_): - self.eval_global_step.assign_add(1) - - self._strategy.run(replica_step, args=(next(iterator),)) - - def eval_end(self): - self.join() - return self.eval_global_step.numpy() - - -class TrainerTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - self._config = cfg.ExperimentConfig( - trainer=cfg.TrainerConfig( - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - } - }) - ) - ) - - def tearDown(self): - gc.collect() - # This will only contain uncollectable garbage, i.e. reference cycles - # involving objects with __del__ defined. 
- self.assertEmpty(gc.garbage) - super().tearDown() - - def create_test_trainer(self, config, model_dir=None, task=None): - task = task or mock_task.MockTask(config.task, logging_dir=model_dir) - ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir) - trainer = trainer_lib.Trainer( - config, - task, - model=task.build_model(), - optimizer=task.create_optimizer(config.trainer.optimizer_config, config.runtime), - checkpoint_exporter=ckpt_exporter - ) - return trainer - - @combinations.generate(all_strategy_combinations()) - def test_trainer_train(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(self._config) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_passing_datasets(self, distribution): - with distribution.scope(): - task = mock_task.MockTask(self._config) - train_dataset = orbit.utils.make_distributed_dataset( - distribution, task.build_inputs, self._config.task.train_data - ) - validation_dataset = orbit.utils.make_distributed_dataset( - distribution, task.build_inputs, self._config.task.validation_data - ) - self._config.task.train_data = None - self._config.task.validation_data = None - trainer = trainer_lib.Trainer( - self._config, - task, - model=task.build_model(), - optimizer=task.create_optimizer(self._config.trainer.optimizer_config, self._config.runtime), - train_dataset=train_dataset, - validation_dataset=validation_dataset - ) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('validation_loss', logs) - - def test_base_async_trainer(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/GPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - trainer = MockAsyncTrainer() - trainer.init_async() - self.assertIsInstance(trainer._coordinator, tf.distribute.experimental.coordinator.ClusterCoordinator) - self.assertEqual(trainer.train(tf.constant(10)), 10) - self.assertEqual(trainer.evaluate(tf.constant(11)), 11) - - def test_async_trainer_train(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/TPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - config = cfg.ExperimentConfig(**self._config.as_dict()) - config.trainer.eval_tf_while_loop = True - trainer = self.create_test_trainer(config) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - - def test_async_trainer_validate(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/GPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - config = cfg.ExperimentConfig(**self._config.as_dict()) - config.trainer.eval_tf_while_loop = True - trainer = 
self.create_test_trainer(config) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('acc', logs) - self.assertIn('validation_loss', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_validate(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(self._config) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) - self.assertIn('validation_loss', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_validate_without_loss(self, distribution): - - class MockTaskWithoutValidationLoss(mock_task.MockTask): - - def validation_step(self, inputs, model, metrics=None): - # Disable validation loss. - logs = super().validation_step(inputs, model) - del logs[self.loss] - return logs - - with distribution.scope(): - task = MockTaskWithoutValidationLoss() - trainer = self.create_test_trainer(self._config, task=task) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) - self.assertNotIn('validation_loss', logs) - - @combinations.generate( - combinations.combine( - mixed_precision_dtype=['float32', 'bfloat16', 'float16'], - loss_scale=[None, 'dynamic', 128, 256], - ) - ) - def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), - trainer=cfg.TrainerConfig( - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - }, - }) - ) - ) - trainer = self.create_test_trainer(config) - if mixed_precision_dtype == 'float16': - self.assertIsInstance(trainer.optimizer, tf.keras.mixed_precision.LossScaleOptimizer) - if loss_scale in (None, 'dynamic'): - self.assertTrue(trainer.optimizer.dynamic) - else: - self.assertFalse(trainer.optimizer.dynamic) - self.assertEqual(trainer.optimizer.initial_scale, loss_scale) - else: - self.assertIsInstance(trainer.optimizer, (tf.keras.optimizers.SGD, tf.keras.optimizers.legacy.SGD)) - - metrics = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', metrics) - - def test_export_best_ckpt(self): - config = cfg.ExperimentConfig( - trainer=cfg.TrainerConfig( - best_checkpoint_export_subdir='best_ckpt', - best_checkpoint_eval_metric='acc', - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - } - }) - ) - ) - model_dir = self.get_temp_dir() - trainer = self.create_test_trainer(config, model_dir=model_dir) - trainer.fit(tf.convert_to_tensor(1, dtype=tf.int32)) - trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32)) - self.assertTrue(tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json'))) - - def test_model_with_compiled_loss(self): - task = mock_task.MockTask() - model = task.build_model() - model.compile(loss=tf.keras.losses.CategoricalCrossentropy()) - trainer = trainer_lib.Trainer( - self._config, task, model=model, optimizer=task.create_optimizer(self._config.trainer.optimizer_config) - ) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - - -if __name__ == '__main__': - tf.test.main() diff --git a/deepray/core/common/distribution_utils.py b/deepray/core/common/distribution_utils.py index 
73887a35..879834a6 100644 --- a/deepray/core/common/distribution_utils.py +++ b/deepray/core/common/distribution_utils.py @@ -24,8 +24,6 @@ from deepray.utils.horovod_utils import is_main_process -FLAGS = flags.FLAGS - def _collective_communication(all_reduce_alg): """Return a CollectiveCommunication based on all_reduce_alg. @@ -97,7 +95,7 @@ def tpu_initialize(tpu_address): return cluster_resolver -def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, num_packs=1, **kwargs): +def get_distribution_strategy(distribution_strategy="off", all_reduce_alg=None, num_packs=1, **kwargs): """Return a Strategy for running the model. Args: distribution_strategy: a string specifying which distribution strategy to @@ -126,15 +124,15 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n `distribution_strategy` is `tpu` but `tpu_address` is not specified. """ del kwargs - if FLAGS.num_gpus < 0: + if flags.FLAGS.num_gpus < 0: raise ValueError("`num_gpus` can not be negative.") - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: distribution_strategy = "off" if is_main_process(): - logging.info("Run horovod and turn off distribution strategy.") + logging.info("Run horovod and turn off TF distribution strategy.") else: - distribution_strategy = FLAGS.distribution_strategy + distribution_strategy = flags.FLAGS.distribution_strategy if not isinstance(distribution_strategy, str): msg = ("distribution_strategy must be a string but got: %s." % (distribution_strategy,)) @@ -152,7 +150,7 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n if distribution_strategy == "tpu": # When tpu_address is an empty string, we communicate with local TPUs. - cluster_resolver = tpu_initialize(FLAGS.tpu) + cluster_resolver = tpu_initialize(flags.FLAGS.tpu_address) return tf.distribute.TPUStrategy(cluster_resolver) if distribution_strategy == "multi_worker_mirrored": @@ -161,25 +159,25 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n ) if distribution_strategy == "one_device": - if FLAGS.num_gpus == 0: + if flags.FLAGS.num_gpus == 0: return tf.distribute.OneDeviceStrategy("device:CPU:0") - if FLAGS.num_gpus > 1: + if flags.FLAGS.num_gpus > 1: raise ValueError("`OneDeviceStrategy` can not be used for more than " "one device.") return tf.distribute.OneDeviceStrategy("device:GPU:0") if distribution_strategy == "mirrored": - if FLAGS.num_gpus == 0: + if flags.FLAGS.num_gpus == 0: devices = ["device:CPU:0"] else: - devices = ["device:GPU:%d" % i for i in range(FLAGS.num_gpus)] + devices = ["device:GPU:%d" % i for i in range(flags.FLAGS.num_gpus)] return tf.distribute.MirroredStrategy( devices=devices, cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs) ) if distribution_strategy == "parameter_server": cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() - return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) + return tf.distribute.ParameterServerStrategy(cluster_resolver) raise ValueError("Unrecognized Distribution Strategy: %r" % distribution_strategy) diff --git a/deepray/core/common/flags.py b/deepray/core/common/flags.py index 13cce788..b592f755 100644 --- a/deepray/core/common/flags.py +++ b/deepray/core/common/flags.py @@ -38,20 +38,6 @@ def define_flags(): flags.DEFINE_string( 'experiment', default=None, help='The experiment type registered, specifying an ExperimentConfig.' 
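As a side note on the version-gated imports added to `deepray/core/compile_utils.py` above, the same guard can be exercised in isolation. The module paths below simply mirror the ones in the hunk; the string rewrite of a possible "-tf" suffix into a "+tf" local-version tag is what lets `packaging.version.parse` accept the version string. This is a sketch, not the library's own dispatch code.

```python
import tensorflow as tf
from packaging.version import parse

tf_version = parse(tf.__version__.replace("-tf", "+tf"))

if tf_version < parse("2.11"):
  module_path = "keras.engine.compile_utils"          # legacy Keras layout
elif tf_version > parse("2.16.0"):
  module_path = "tf_keras.src.engine.compile_utils"   # standalone tf-keras package
else:
  module_path = "keras.src.engine.compile_utils"      # Keras 2.11 - 2.16
print("MetricsContainer would come from", module_path)
```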
) - - flags.DEFINE_enum( - 'mode', - default=None, - enum_values=[ - 'train', 'eval', 'train_and_eval', 'continuous_eval', 'continuous_train_and_eval', 'train_and_validate', - 'train_and_post_eval' - ], - help='Mode to run: `train`, `eval`, `train_and_eval`, ' - '`continuous_eval`, `continuous_train_and_eval` and ' - '`train_and_validate` (which is not implemented in ' - 'the open source version).' - ) - flags.DEFINE_string( 'model_dir', default=None, help='The directory where the model and training/evaluation summaries' 'are stored.' diff --git a/deepray/core/compile_utils.py b/deepray/core/compile_utils.py index bc0e8150..5ee1479a 100644 --- a/deepray/core/compile_utils.py +++ b/deepray/core/compile_utils.py @@ -1,10 +1,16 @@ import horovod.tensorflow as hvd import tensorflow as tf from absl import flags -from keras.engine.compile_utils import MetricsContainer, match_dtype_and_rank, get_mask, apply_mask -# Keras = 2.9.0 +from packaging.version import parse -FLAGS = flags.FLAGS +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras.engine.compile_utils import MetricsContainer, match_dtype_and_rank, get_mask, apply_mask +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine.compile_utils import MetricsContainer, match_dtype_and_rank + from tf_keras.src.utils.losses_utils import get_mask, apply_mask +else: + from keras.src.engine.compile_utils import MetricsContainer, match_dtype_and_rank + from keras.src.utils.losses_utils import get_mask, apply_mask class HvdMetricsContainer(MetricsContainer): @@ -37,12 +43,12 @@ def update_state(self, y_true, y_pred, sample_weight=None): mask = get_mask(y_p) sw = apply_mask(y_p, sw, mask) - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: y_t = hvd.allgather(y_t) y_p = hvd.allgather(y_p) - if mask: + if mask is not None: mask = hvd.allgather(mask) - if sw: + if sw is not None: sw = hvd.allgather(sw) for metric_obj in metric_objs: diff --git a/deepray/core/dllogger_class.py b/deepray/core/dllogger_class.py deleted file mode 100644 index 2c851120..00000000 --- a/deepray/core/dllogger_class.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from dllogger import Logger, StdOutBackend, JSONStreamBackend, Verbosity - - -class dllogger_class(): - - def format_step(self, step): - if isinstance(step, str): - return step - elif isinstance(step, int): - return "Iteration: {} ".format(step) - elif len(step) > 0: - return "Iteration: {} ".format(step[0]) - else: - return "" - - def __init__(self, log_path="bert_dllog.json"): - self.logger = Logger( - [ - StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step), - JSONStreamBackend(Verbosity.VERBOSE, log_path), - ] - ) - self.logger.metadata("mlm_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("nsp_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("avg_loss_step", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("total_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("f1", {"unit": None, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("precision", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("recall", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("mcc", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("exact_match", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata( - "throughput_train", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "TRAIN" - }, - ) - self.logger.metadata( - "throughput_inf", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) - self.logger.metadata( - "throughput_val", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) diff --git a/deepray/core/module.py b/deepray/core/module.py deleted file mode 100644 index b450e1d9..00000000 --- a/deepray/core/module.py +++ /dev/null @@ -1,627 +0,0 @@ -import time - -import numpy as np -import tensorflow as tf -from absl import logging, flags -from keras import callbacks as callbacks_module -from keras.engine import base_layer -from keras.engine import data_adapter -from keras.engine.data_adapter import _ClusterCoordinatorDataHandler, DataHandler -from keras.utils import tf_utils -from keras.utils import version_utils -from packaging import version -from tensorflow.python.eager import context - -from deepray.core.common import distribution_utils -from deepray.utils import export -from deepray.utils.horovod_utils import is_main_process - -FLAGS = flags.FLAGS - - -def _minimum_control_deps(outputs): - """Returns the minimum control dependencies to ensure step succeeded.""" - if tf.executing_eagerly(): - return [] # Control dependencies not needed. - outputs = tf.nest.flatten(outputs, expand_composites=True) - for out in outputs: - # Variables can't be control dependencies. - if not isinstance(out, tf.Variable): - return [out] # Return first Tensor or Op from outputs. - return [] # No viable Tensor or Op to use for control deps. 
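The `_minimum_control_deps` helper in the removed module.py exists so that a step counter is only incremented after the step's outputs have actually been produced. A small self-contained illustration of that control-dependency pattern follows; the `counter` and the toy `test_step_with_counter` are hypothetical stand-ins, not the module's own code.

```python
import tensorflow as tf

counter = tf.Variable(0, dtype=tf.int64)

@tf.function
def test_step_with_counter(x):
  outputs = tf.reduce_sum(x)  # stand-in for a real test_step's outputs
  # While tracing a tf.function, executing_eagerly() is False, so the
  # counter update is anchored on `outputs` having been computed first.
  deps = [] if tf.executing_eagerly() else [outputs]
  with tf.control_dependencies(deps):
    counter.assign_add(1)
  return outputs

test_step_with_counter(tf.ones((2, 2)))
print(int(counter))  # -> 1
```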
- - -def flatten_metrics_in_order(logs, metrics_names): - """Turns the `logs` dict into a list as per key order of `metrics_names`.""" - results = [] - for name in metrics_names: - if name in logs: - results.append(logs[name]) - for key in sorted(logs.keys()): - if key not in metrics_names: - results.append(logs[key]) - if len(results) == 1: - return results[0] - return results - - -class DataHandlerMOD(DataHandler): - - def _validate_data_handler(self): - pass - - -def get_data_handler(*args, **kwargs): - if getattr(kwargs["model"], "_cluster_coordinator", None): - return _ClusterCoordinatorDataHandler(*args, **kwargs) - return DataHandlerMOD(*args, **kwargs) - - -class Module(): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._distribution_strategy = distribution_utils.get_distribution_strategy() - self._init_batch_counters() - self.eval_steps = None - self._cluster_coordinator = None - - self.test_function = None - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _init_batch_counters(self): - # Untracked Variables, used to keep track of mini-batches seen in `fit`, - # `evaluate`, and `predict`. - agg = tf.VariableAggregation.ONLY_FIRST_REPLICA - self._train_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._test_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._predict_counter = tf.Variable(0, dtype='int64', aggregation=agg) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _configure_steps_per_execution(self, steps_per_execution): - self._steps_per_execution = tf.Variable( - steps_per_execution, dtype='int64', aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - - @property - def distribute_strategy(self): - """The `tf.distribute.Strategy` this model was created under.""" - return self._distribution_strategy or tf.distribute.get_strategy() - - def steps_to_run(self, current_step, steps_per_epoch, steps_per_loop): - """Calculates steps to run on device.""" - if steps_per_loop <= 0: - raise ValueError('steps_per_loop should be positive integer.') - if steps_per_loop == 1: - return steps_per_loop, FLAGS.num_accumulation_steps - - # Note: broadcast should be called after the first gradient step to ensure optimizer - # initialization. - # if self.use_horovod and self.current_step == self._first_steps: - # return 1, 1 - - remainder_in_epoch = current_step % steps_per_epoch - if remainder_in_epoch != 0: - return min(steps_per_epoch - remainder_in_epoch, steps_per_loop), FLAGS.num_accumulation_steps - else: - return steps_per_loop, FLAGS.num_accumulation_steps - - def _float_metric_value(self, metric): - """Gets the value of a float-value keras metric.""" - return metric.result().numpy().astype(float) - - def on_epoch_begin(self, epoch): - self._step_epoch = 0 - """Calls the `on_epoch_begin` methods of its callbacks. - """ - self.callbacks.on_epoch_begin(epoch) - - # Training loss/metric are taking average over steps inside micro - # training loop. We reset their values before each round. - self.loss_container.reset_state() - if self.metric_container: - self.metric_container.reset_state() - - def on_batch_end(self, logs, steps, t0): - """Runs custom callbacks at the end of every N(steps) step.""" - self._step_epoch += steps - self.current_step += steps - - self.callbacks.on_train_batch_end(self.current_step, logs) - - elapse_time = time.time() - t0 - # Updates training logging. 
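The `steps_to_run` helper in the removed module.py clips the number of device-side steps so a host loop never runs past the current epoch boundary. A free-standing equivalent (minus the `FLAGS.num_accumulation_steps` bookkeeping) behaves like this:

```python
def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
  # Free-standing equivalent of the method above, without FLAGS handling.
  if steps_per_loop <= 0:
    raise ValueError("steps_per_loop should be a positive integer.")
  if steps_per_loop == 1:
    return 1
  remainder = current_step % steps_per_epoch
  if remainder != 0:
    # Never run past the current epoch boundary.
    return min(steps_per_epoch - remainder, steps_per_loop)
  return steps_per_loop

assert steps_to_run(95, steps_per_epoch=100, steps_per_loop=10) == 5
assert steps_to_run(100, steps_per_epoch=100, steps_per_loop=10) == 10
```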
- if self.steps_per_epoch > 0: - training_status = 'Train Step: %d/%d / time=%.3f sec' % ( - self.current_step, self.steps_per_epoch * self.epochs + self._first_steps, elapse_time - ) - else: - training_status = 'Train Step: %d / time=%.3f sec' % (self.current_step, elapse_time) - - self._steps_from_save += steps - - if self._steps_from_save >= FLAGS.save_checkpoint_steps: - export.export_to_checkpoint(self.manager, self.current_step) - self._steps_from_save = 0 - - if self.train_summary_writer: - with self.train_summary_writer.as_default(): - for metric in self.metrics: - metric_value = self._float_metric_value(metric) - training_status += ' %s=%f' % (metric.name, metric_value) - tf.summary.scalar(metric.name, metric_value, step=self.current_step) - self.train_summary_writer.flush() - - # The number of samples trained per second - step_throughput = self._performance_calculator(steps, self.global_batch_size) - if is_main_process(): - if self.use_float16: - if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - logging.info( - 'Step: %d Lr %g Loss scale %g' % - (self.current_step, self.optimizer._optimizer._decayed_lr('float32'), self.optimizer.loss_scale) - ) - else: - logging.info( - 'Step: %d Lr %g Loss scale %g' % (self.current_step, self.optimizer.lr, self.optimizer.loss_scale) - ) - - logging.info(training_status) - logging.info('Perf %.2f samples/s' % step_throughput) - - if self.current_step > self._first_steps + steps * 2: - self._perf_wo += step_throughput - self._perf_wo_n += 1 - - def on_epoch_end(self, epoch, current_step, eval_input, epoch_logs=None): - # Saves model checkpoints and run validation steps at every epoch end. - # To avoid repeated model saving, we do not save after the last step of training. - if epoch < self.epochs - 1: - export.export_to_checkpoint(self.manager, current_step) - if eval_input: # and is_main_process(): - if is_main_process(): - logging.info('Running evaluation after step: %s.', current_step) - - val_logs = self.evaluate(eval_input, self.eval_steps) - val_logs = {'val_' + name: val for name, val in val_logs.items()} - epoch_logs.update(val_logs) - - if is_main_process(): - with self.eval_summary_writer.as_default(): - for name, value in val_logs.items(): - logging.info('Step: [%d] Validation %s = %f', current_step, name, value) - tf.summary.scalar(name, value, step=current_step) - self.eval_summary_writer.flush() - """Calls the `on_epoch_end` methods of its callbacks. - """ - self.callbacks.on_epoch_end(epoch, epoch_logs) - - def evaluate(self, eval_input: tf.data.Dataset, eval_steps: int = None, callbacks=None, return_dict=True, **kwargs): - """Returns the loss value & metrics values for the model in test mode. - - Computation is done in batches (see the `batch_size` arg.) - - Args: - eval_input: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). - If `x` is a dataset, generator or `keras.utils.Sequence` instance, - `y` should not be specified (since targets will be obtained from - the iterator/dataset). - eval_steps: Integer or `None`. Total number of steps (batches of samples) - before declaring the evaluation round finished. Ignored with the - default value of `None`. If x is a `tf.data` dataset and `steps` - is None, 'evaluate' will run until the dataset is exhausted. This - argument is not supported with array inputs. 
- - - See the discussion of `Unpacking behavior for iterator-like inputs` for - `Model.fit`. - - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - RuntimeError: If `trainer.evaluate` is wrapped in a `tf.function`. - """ - if eval_steps is None: - if self.eval_steps is not None: - eval_steps = self.eval_steps - else: - if self.eval_steps is None: - self.eval_steps = eval_steps - """Runs validation steps and aggregate metrics.""" - if self.eval_steps is None: - self.eval_steps = eval_steps - - base_layer.keras_api_gauge.get_cell('evaluate').set(True) - version_utils.disallow_legacy_graph('Model', 'evaluate') - - use_cached_eval_dataset = kwargs.pop('_use_cached_eval_dataset', False) - if kwargs: - raise TypeError(f'Invalid keyword arguments: {list(kwargs.keys())}') - - # TODO(@fuhailin): custom ProgbarLogger fix bug when verbose = 2 - verbose = 1 - with distribution_utils.get_strategy_scope(self._distribution_strategy): - # Use cached evaluation data only when it's called in `Model.fit` - if use_cached_eval_dataset and getattr(self, '_eval_data_handler', None) is not None: - data_handler = self._eval_data_handler - else: - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - data_handler = get_data_handler( - x=eval_input, - y=None, - sample_weight=None, - batch_size=FLAGS.batch_size, - steps_per_epoch=self.eval_steps, - initial_epoch=0, - epochs=1, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - model=self.main_model, - steps_per_execution=self._steps_per_execution - ) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, callbacks_module.CallbackList): - callbacks = callbacks_module.CallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self.main_model, - verbose=verbose, - epochs=1, - steps=data_handler.inferred_steps - ) - logs = {} - self.test_function = self.make_test_function() - self._test_counter.assign(0) - callbacks.on_test_begin() - for _, iterator in data_handler.enumerate_epochs(): # Single epoch. - # Re-initialize evaluation metric. - self.reset_metrics() - while eval_steps is None or self._test_counter.numpy() < eval_steps: - try: - steps, _ = self.steps_to_run( - self._test_counter.numpy(), - steps_per_epoch=eval_steps if eval_steps else -1, - steps_per_loop=FLAGS.steps_per_summary - ) - with tf.profiler.experimental.Trace('test', step_num=self._test_counter.numpy(), _r=1): - callbacks.on_test_batch_begin(self._test_counter.numpy()) - tmp_logs = self.test_function(iterator, tf.convert_to_tensor(steps, dtype=tf.int32)) - if data_handler.should_sync: - context.async_wait() - logs = tmp_logs # No error, now safe to assign to logs. - callbacks.on_test_batch_end(self._test_counter.numpy(), logs) - except (tf.errors.OutOfRangeError, StopIteration): - callbacks.on_test_batch_end(self._test_counter.numpy(), logs) - self.eval_steps = self._test_counter.numpy() - if is_main_process(): - logging.info('Data exhausted after %d eval_steps', self._test_counter.numpy()) - break - - logs = tf_utils.sync_to_numpy_or_python_type(logs) - callbacks.on_test_end(logs=logs) - - if return_dict: - return logs - else: - return flatten_metrics_in_order(logs, self.metrics_names) - - def make_test_function(self, force=False): - """Creates a function that executes one step of evaluation. 
- - This method can be overridden to support custom evaluation logic. - This method is called by `Model.evaluate` and `Model.test_on_batch`. - - Typically, this method directly controls `tf.function` and - `tf.distribute.Strategy` settings, and delegates the actual evaluation - logic to `Model.test_step`. - - This function is cached the first time `Model.evaluate` or - `Model.test_on_batch` is called. The cache is cleared whenever - `Model.compile` is called. You can skip the cache and generate again the - function with `force=True`. - - Args: - force: Whether to regenerate the test function and skip the cached - function if available. - - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, and return a `dict` containing values that will - be passed to `tf.keras.Callbacks.on_test_batch_end`. - """ - if self.test_function is not None and not force: - return self.test_function - - def step_function(trainer, iterator): - """Runs a single evaluation step.""" - - def run_step(data): - outputs = self.test_step(data) - # Ensure counter is updated only if `test_step` succeeds. - with tf.control_dependencies(_minimum_control_deps(outputs)): - trainer._test_counter.assign_add(1) # pylint: disable=protected-access - return outputs - - if self._jit_compile: - run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) - - data = next(iterator) - outputs = run_step(data) - return outputs - - # Special case if steps_per_execution is one. - if self._steps_per_execution is None or self._steps_per_execution.numpy().item() == 1: - - def test_function(iterator): - """Runs a test execution with a single step.""" - return step_function(self, iterator) - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - - if self._cluster_coordinator: - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, args=(it,)) - else: - self.test_function = test_function - - # If we're using a coordinator, use the value of self._steps_per_execution - # at the time the function is called/scheduled, and not when it is actually - # executed. - elif self._cluster_coordinator: - - def test_function(iterator, steps_per_execution): - """Runs a test execution with multiple steps.""" - for _ in tf.range(steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, - args=(it, self._steps_per_execution.value())) - else: - - def test_function(iterator, steps): - """Runs a test execution with multiple steps.""" - for _ in tf.range(steps): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - self.test_function = test_function - - return self.test_function - - @property - def metrics(self): - metrics = [] - if self.loss_container is not None: - metrics += self.loss_container.metrics - if self.metric_container is not None: - metrics += self.metric_container.metrics - return metrics - - def get_metrics_result(self): - """Returns the model's metrics values as a dict. - - If any of the metric result is a dict (containing multiple metrics), - each of them gets added to the top level returned dict of this method. 
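The multi-step branch of `make_test_function` above amounts to running `step_function` inside a `tf.range` loop so that several batches execute per host call. A minimal standalone version of that loop is sketched below; the toy dataset and metric are placeholders, and the `reduce_retracing` argument assumes a reasonably recent TF release, matching what the removed code itself uses.

```python
import tensorflow as tf

mean_loss = tf.keras.metrics.Mean(name="loss")
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10, dtype=tf.float32)).batch(2)

def step_function(batch):
  mean_loss.update_state(tf.reduce_mean(batch))
  return {"loss": mean_loss.result()}

@tf.function(reduce_retracing=True)
def test_function(iterator, steps):
  # `steps` batches are consumed per call, so the host loop wakes less often.
  outputs = {"loss": tf.constant(0.0)}
  for _ in tf.range(steps):
    outputs = step_function(next(iterator))
  return outputs

it = iter(dataset)
print(test_function(it, tf.constant(2)))  # metric state after two batches
```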
- - Returns: - A `dict` containing values of the metrics listed in `self.metrics`. - Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - # Collect metrics to return - return_metrics = {} - for metric in self.metrics: - result = metric.result() - if isinstance(result, dict): - return_metrics.update(result) - else: - return_metrics[metric.name] = result - return return_metrics - - def test_step(self, data): - """The logic for one evaluation step. - - This method can be overridden to support custom evaluation logic. - This method is called by `Model.make_test_function`. - - This function should contain the mathematical logic for one step of - evaluation. - This typically includes the forward pass, loss calculation, and metrics - updates. - - Configuration details for *how* this logic is run (e.g. `tf.function` and - `tf.distribute.Strategy` settings), should be left to - `Model.make_test_function`, which can also be overridden. - - Args: - data: A nested structure of `Tensor`s. - - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the - values of the `Model`'s metrics are returned. - """ - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - - y_pred = self.main_model(x, training=False) - # Updates stateful loss metrics. - self.compute_loss(x, y, y_pred, sample_weight) - return self.compute_metrics(x, y, y_pred, sample_weight) - - def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): - """Compute the total loss, validate it, and return it. - - Subclasses can optionally override this method to provide custom loss - computation logic. - - Example: - ```python - class MyModel(tf.keras.Model): - - def __init__(self, *args, **kwargs): - super(MyModel, self).__init__(*args, **kwargs) - self.loss_tracker = tf.keras.metrics.Mean(name='loss') - - def compute_loss(self, x, y, y_pred, sample_weight): - loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) - loss += tf.add_n(self.losses) - self.loss_tracker.update_state(loss) - return loss - - def reset_metrics(self): - self.loss_tracker.reset_states() - - @property - def metrics(self): - return [self.loss_tracker] - - tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) - dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) - - inputs = tf.keras.layers.Input(shape=(10,), name='my_input') - outputs = tf.keras.layers.Dense(10)(inputs) - model = MyModel(inputs, outputs) - model.add_loss(tf.reduce_sum(outputs)) - - optimizer = tf.keras.optimizers.SGD() - model.compile(optimizer, loss='mse', steps_per_execution=10) - model.fit(dataset, epochs=2, steps_per_epoch=10) - print('My custom loss: ', model.loss_tracker.result().numpy()) - ``` - - Args: - x: Input data. - y: Target data. - y_pred: Predictions returned by the model (output of `model(x)`) - sample_weight: Sample weights for weighting the loss function. - - Returns: - The total loss as a `tf.Tensor`, or `None` if no loss results (which is - the case when called by `Model.test_step`). - """ - del x # The default implementation does not use `x`. - return self.loss_container( - y, - y_pred, - sample_weight, - # regularization_losses=self.losses - ) - - def compute_metrics(self, x, y, y_pred, sample_weight): - """Update metric states and collect all metrics to be returned. - - Subclasses can optionally override this method to provide custom metric - updating and collection logic. 
- - Example: - ```python - class MyModel(tf.keras.Sequential): - - def compute_metrics(self, x, y, y_pred, sample_weight): - - # This super call updates `self.compiled_metrics` and returns results - # for all metrics listed in `self.metrics`. - metric_results = super(MyModel, self).compute_metrics( - x, y, y_pred, sample_weight) - - # Note that `self.custom_metric` is not listed in `self.metrics`. - self.custom_metric.update_state(x, y, y_pred, sample_weight) - metric_results['custom_metric_name'] = self.custom_metric.result() - return metric_results - ``` - - Args: - x: Input data. - y: Target data. - y_pred: Predictions returned by the model (output of `model.call(x)`) - sample_weight: Sample weights for weighting the loss function. - - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the - values of the metrics listed in `self.metrics` are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - del x # The default implementation does not use `x`. - self.metric_container.update_state(y, y_pred, sample_weight) - # Collect metrics to return - return self.get_metrics_result() - - def reset_metrics(self): - """Resets the state of all the metrics in the model. - - Examples: - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> outputs = tf.keras.layers.Dense(2)(inputs) - >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) - - >>> x = np.random.random((2, 3)) - >>> y = np.random.randint(0, 2, (2, 2)) - >>> _ = model.fit(x, y, verbose=0) - >>> assert all(float(m.result()) for m in model.metrics) - - >>> model.reset_metrics() - >>> assert all(float(m.result()) == 0 for m in model.metrics) - - """ - for m in self.metrics: - m.reset_state() - - @property - def metrics_names(self): - """Returns the model's display labels for all outputs. - - Note: `metrics_names` are available only after a `keras.Model` has been - trained/evaluated on actual data. - - Examples: - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> outputs = tf.keras.layers.Dense(2)(inputs) - >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) - >>> model.metrics_names - [] - - >>> x = np.random.random((2, 3)) - >>> y = np.random.randint(0, 2, (2, 2)) - >>> model.fit(x, y) - >>> model.metrics_names - ['loss', 'mae'] - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> d = tf.keras.layers.Dense(2, name='out') - >>> output_1 = d(inputs) - >>> output_2 = d(inputs) - >>> model = tf.keras.models.Model( - ... inputs=inputs, outputs=[output_1, output_2]) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) - >>> model.fit(x, (y, y)) - >>> model.metrics_names - ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', - 'out_1_acc'] - - """ - - # This property includes all output names including `loss` and per-output - # losses for backward compatibility. 
- return [m.name for m in self.metrics] diff --git a/deepray/core/platform/build_config.default.bzl b/deepray/core/platform/build_config.default.bzl index 2590d8a1..dc54c11f 100644 --- a/deepray/core/platform/build_config.default.bzl +++ b/deepray/core/platform/build_config.default.bzl @@ -1,12 +1,12 @@ """OSS versions of Bazel macros that can't be migrated to TSL.""" +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//deepray/tsl:tsl.bzl", "clean_dep", "if_libtpu", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//third_party/mkl:build_defs.bzl", "if_mkl_ml", diff --git a/deepray/core/platform/build_config_root.bzl b/deepray/core/platform/build_config_root.bzl index 4fc39673..30a4fee6 100644 --- a/deepray/core/platform/build_config_root.bzl +++ b/deepray/core/platform/build_config_root.bzl @@ -1,5 +1,10 @@ """Provides a redirection point for platform specific implementations of starlark utilities.""" +load( + "//deepray/core/platform:build_config_root.default.bzl", + _if_dynamic_kernels = "if_dynamic_kernels", + _tf_additional_plugin_deps = "tf_additional_plugin_deps", +) load( "//deepray/tsl/platform/default:build_config_root.bzl", _if_static = "if_static", @@ -12,11 +17,6 @@ load( _tf_exec_properties = "tf_exec_properties", _tf_gpu_tests_tags = "tf_gpu_tests_tags", ) -load( - "//deepray/core/platform:build_config_root.default.bzl", - _if_dynamic_kernels = "if_dynamic_kernels", - _tf_additional_plugin_deps = "tf_additional_plugin_deps", -) if_dynamic_kernels = _if_dynamic_kernels if_static = _if_static diff --git a/deepray/core/trainer.py b/deepray/core/trainer.py new file mode 100644 index 00000000..8e112e0b --- /dev/null +++ b/deepray/core/trainer.py @@ -0,0 +1,3073 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Training-related part of the TF-Keras engine.""" + +import copy +import os +import random +import sys +import warnings +import weakref +from typing import Union, List, Dict, Text + +import horovod.tensorflow as hvd +import numpy as np +import tensorflow as tf +import tf_keras as keras +from absl import flags +from tensorflow.python.distribute import distribute_utils +from tensorflow.python.distribute import input_ops +from tensorflow.python.eager import context +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls +from tf_keras import callbacks as callbacks_module +from tf_keras import optimizers +from tf_keras.src.dtensor import dtensor_api +from tf_keras.src.dtensor import layout_map as layout_map_lib +from tf_keras.src.engine import compile_utils +from tf_keras.src.engine import data_adapter +from tf_keras.src.engine import training as training_module +from tf_keras.src.engine import training_utils +from tf_keras.src.metrics import base_metric +from tf_keras.src.mixed_precision import loss_scale_optimizer as lso +from tf_keras.src.optimizers import optimizer +from tf_keras.src.optimizers import optimizer_v1 +from tf_keras.src.saving import serialization_lib +from tf_keras.src.utils import generic_utils +from tf_keras.src.utils import steps_per_execution_tuning +from tf_keras.src.utils import tf_utils +from tf_keras.src.utils import traceback_utils +from tf_keras.src.utils import version_utils +from tf_keras.src.utils.mode_keys import ModeKeys + +from deepray.callbacks import HvdCallbackList +from deepray.callbacks.progbar_logger import ProgbarLogger +from deepray.custom_ops.embedding_variable import kv_variable_ops +from deepray.utils import logging_util +from deepray.utils.horovod_utils import is_main_process + +logger = logging_util.get_logger() + +try: + from tensorflow_recommenders_addons.dynamic_embedding.python.ops.embedding_weights import TrainableWrapper + from tensorflow_recommenders_addons.dynamic_embedding.python.ops.shadow_embedding_ops import DEResourceVariable +except ImportError: + TrainableWrapper, DEResourceVariable = None, None + + +def set_random_seed(random_seed): + random.seed(random_seed) # set random seed for python + np.random.seed(random_seed) # set random seed for numpy + tf.random.set_seed(random_seed) # set random seed for tensorflow-cpu + os.environ['TF_DETERMINISTIC_OPS'] = '1' # set random seed for tensorflow-gpu + + +@keras_export("keras.Model", "keras.models.Model") +class Trainer(): + """A model grouping layers into an object with training/inference features. + + Args: + inputs: The input(s) of the model: a `keras.Input` object or a + combination of `keras.Input` objects in a dict, list or tuple. + outputs: The output(s) of the model: a tensor that originated from + `keras.Input` objects or a combination of such tensors in a dict, + list or tuple. See Functional API example below. + name: String, the name of the model. 
+ + There are two ways to instantiate a `Model`: + + 1 - With the "Functional API", where you start from `Input`, + you chain layer calls to specify the model's forward pass, + and finally you create your model from inputs and outputs: + + ```python + import tensorflow as tf + + inputs = tf.keras.Input(shape=(3,)) + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + model = tf.keras.Model(inputs=inputs, outputs=outputs) + ``` + + Note: Only dicts, lists, and tuples of input tensors are supported. Nested + inputs are not supported (e.g. lists of list or dicts of dict). + + A new Functional API model can also be created by using the + intermediate tensors. This enables you to quickly extract sub-components + of the model. + + Example: + + ```python + inputs = keras.Input(shape=(None, None, 3)) + processed = keras.layers.RandomCrop(width=32, height=32)(inputs) + conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed) + pooling = keras.layers.GlobalAveragePooling2D()(conv) + feature = keras.layers.Dense(10)(pooling) + + full_model = keras.Model(inputs, feature) + backbone = keras.Model(processed, conv) + activations = keras.Model(conv, feature) + ``` + + Note that the `backbone` and `activations` models are not + created with `keras.Input` objects, but with the tensors that are originated + from `keras.Input` objects. Under the hood, the layers and weights will + be shared across these models, so that user can train the `full_model`, and + use `backbone` or `activations` to do feature extraction. + The inputs and outputs of the model can be nested structures of tensors as + well, and the created models are standard Functional API models that support + all the existing APIs. + + 2 - By subclassing the `Model` class: in that case, you should define your + layers in `__init__()` and you should implement the model's forward pass + in `call()`. + + ```python + import tensorflow as tf + + class MyModel(tf.keras.Model): + + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + + def call(self, inputs): + x = self.dense1(inputs) + return self.dense2(x) + + model = MyModel() + ``` + + If you subclass `Model`, you can optionally have + a `training` argument (boolean) in `call()`, which you can use to specify + a different behavior in training and inference: + + ```python + import tensorflow as tf + + class MyModel(tf.keras.Model): + + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + self.dropout = tf.keras.layers.Dropout(0.5) + + def call(self, inputs, training=False): + x = self.dense1(inputs) + if training: + x = self.dropout(x, training=training) + return self.dense2(x) + + model = MyModel() + ``` + + Once the model is created, you can config the model with losses and metrics + with `model.compile()`, train the model with `model.fit()`, or use the model + to do prediction with `model.predict()`. 
+ """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def __init__( + self, + model: Union[keras.Model, List[keras.Model], Dict[Text, keras.Model]], + optimizer="rmsprop", + loss=None, + metrics=None, + loss_weights=None, + weighted_metrics=None, + run_eagerly=None, + steps_per_execution=None, + jit_compile=None, + pss_evaluation_shards=0, + *args, + **kwargs + ): + + self._model = {} + if isinstance(model, list): + if len(model) > 0: + self._model = {"main": model[0]} + if len(model) == 2: + self._model["sub_model"] = model[1] + else: + for i in range(1, len(model)): + self._model[f"sub_model{i}"] = model[i] + else: + raise ValueError("Not a reachable model.") + elif isinstance(model, dict): + main_keys = [k for k in model.keys() if "main" in k] + if len(main_keys) == 1: + if (len(model) == 1): + self._model = {"main": next(iter(model.values()))} + else: + self._model = model + else: + raise ValueError(f"Must set only one model with key contains \"main\", found {main_keys}.") + elif isinstance(model, (keras.Model, tf.keras.Model)): + self._model = {"main": model} + else: + raise ValueError("Not a reachable model.") + + if run_eagerly is None: + run_eagerly = flags.FLAGS.run_eagerly + + if steps_per_execution is None: + steps_per_execution = flags.FLAGS.steps_per_execution + + # Special case for Subclassed Functional Model, which we couldn't detect + # when __new__ is called. We only realize it is a functional model when + # it calls super.__init__ with input and output tensor. + from tf_keras.src.engine import functional + + if training_module.is_functional_model_init_params(args, kwargs) and not isinstance(self, functional.Functional): + # Filter the kwargs for multiple inheritance. + supported_kwargs = [ + "inputs", + "outputs", + "name", + "trainable", + "skip_init", + ] + model_kwargs = {k: kwargs[k] for k in kwargs if k in supported_kwargs} + other_kwargs = {k: kwargs[k] for k in kwargs if k not in supported_kwargs} + training_module.inject_functional_model_class(self.__class__) + functional.Functional.__init__(self, *args, **model_kwargs) + + # In case there is any multiple inheritance here, we need to call + # the __init__ for any class that appears after the Functional + # class. + clz_to_init = [] + found_functional_class = False + for clz in self.__class__.__bases__: + if issubclass(clz, functional.Functional): + found_functional_class = True + continue + if found_functional_class: + clz_to_init.append(clz) + + if clz_to_init: + for clz in clz_to_init: + clz.__init__(self, *args, **other_kwargs) + elif other_kwargs: + # In case there are unused kwargs, we should raise an error to + # user, in case they have a typo in the param name. + raise TypeError( + "The following keyword arguments passed to `Model` aren't " + "supported: {}.".format(other_kwargs) + ) + return + + # The following are implemented as property functions: + # self.trainable_weights + # self.non_trainable_weights + # `inputs` / `outputs` will only appear in kwargs if either are + # misspelled. + generic_utils.validate_kwargs( + kwargs, + { + "trainable", + "dtype", + "dynamic", + "name", + "autocast", + "inputs", + "outputs", + }, + ) + super().__init__(**kwargs) + + # stop_training is used by callback to stop training when error happens + self.stop_training = False + self.history = None + # These objects are used in the default `Model.compile`. 
They are not + # guaranteed to be set after `Model.compile` is called, as users can + # override compile with custom logic. + self.compiled_loss = None + self.compiled_metrics = None + + # Don't reset compilation if already done. This may occur if calling + # `__init__` (or `_init_graph_network`) on an already-compiled model + # such as a Sequential model. Sequential models may need to rebuild + # themselves after compilation. + self._maybe_create_attribute("_is_compiled", False) + self._maybe_create_attribute("optimizer", None) + + # Model must be created under scope of DistStrat it will be trained + # with. + if tf.distribute.has_strategy(): + self._distribution_strategy = tf.distribute.get_strategy() + else: + self._distribution_strategy = None + self._distribute_reduction_method = None + + self._cluster_coordinator = None + + # Defaults to value of `tf.config.experimental_functions_run_eagerly`. + self._run_eagerly = None + # Initialize cache attrs. + self._reset_compile_cache() + + # Fault-tolerance handler. Set in `ModelCheckpoint`. + self._training_state = None + + self._steps_per_execution = None + self._steps_per_execution_tuner = None + self._autotune_steps_per_execution = False + + self._layout_map = layout_map_lib.get_current_layout_map() + + self._init_batch_counters() + self._base_model_initialized = True + + # `jit_compile` starts off with None as default and gets overwritten by + # the value specified in `Model.compile`, and this is effective for + # `fit`, `evaluate`, and `predict`. + self._jit_compile = None + + self.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + jit_compile=jit_compile, + pss_evaluation_shards=pss_evaluation_shards, + **kwargs, + ) + + if is_main_process(): + logger.info("Initialize training") + logger.info("flags.FLAGS:") + for key, value in sorted(flags.FLAGS.flag_values_dict().items()): + logger.info(f"\t{key:25}= {value}") + if flags.FLAGS.random_seed is not None: + set_random_seed(flags.FLAGS.random_seed) + + def _create_counter_variable(self, init_value): + """Helper function for counter variable creation. + + For the DTensor use case with layout map, since the variable are not + tracked by model, they can't be visited by the layout map, and need to + be properly initialized as DVariable. + """ + # This function should be removed after we move to the strategy based + # implementation for DTensor. + if self._layout_map is None: + agg = tf.VariableAggregation.ONLY_FIRST_REPLICA + return tf.Variable(init_value, dtype="int64", aggregation=agg) + else: + layout = dtensor_api.Layout.replicated(mesh=self._layout_map.get_default_mesh(), rank=0) + return dtensor_api.DVariable(init_value, dtype="int64", layout=layout) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _init_batch_counters(self): + # Untracked Variables, used to keep track of mini-batches seen in `fit`, + # `evaluate`, and `predict`. + if not tf.inside_function(): + # Creating variables inside tf.function is not allowed, hence + # these would otherwise prevent users from creating TF-Keras layers + # inside tf.function. + # These variables are not connected to outputs so they have no + # effect on graph generation anyway. 
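      # Note: `compile()` below also assigns `_train_counter` to
      # `optimizer.global_step`, so any logic that reads the optimizer's
      # global step advances in lockstep with the counters created here.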
+ + self._train_counter = self._create_counter_variable(0) + self._test_counter = self._create_counter_variable(0) + self._predict_counter = self._create_counter_variable(0) + if flags.FLAGS.use_horovod: + self.first_batch = tf.Variable(True, trainable=False, dtype=tf.bool, name='first_batch') + + @traceback_utils.filter_traceback + def compile( + self, + optimizer="rmsprop", + loss=None, + metrics=None, + loss_weights=None, + weighted_metrics=None, + run_eagerly=None, + steps_per_execution=None, + jit_compile=None, + pss_evaluation_shards=0, + **kwargs, + ): + """Configures the model for training. + + Example: + + ```python + model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), + loss=tf.keras.losses.BinaryCrossentropy(), + metrics=[tf.keras.metrics.BinaryAccuracy(), + tf.keras.metrics.FalseNegatives()]) + ``` + + Args: + optimizer: String (name of optimizer) or optimizer instance. See + `tf.keras.optimizers`. + loss: Loss function. May be a string (name of loss function), or + a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss + function is any callable with the signature `loss = fn(y_true, + y_pred)`, where `y_true` are the ground truth values, and + `y_pred` are the model's predictions. + `y_true` should have shape + `(batch_size, d0, .. dN)` (except in the case of + sparse loss functions such as + sparse categorical crossentropy which expects integer arrays of + shape `(batch_size, d0, .. dN-1)`). + `y_pred` should have shape `(batch_size, d0, .. dN)`. + The loss function should return a float tensor. + If a custom `Loss` instance is + used and reduction is set to `None`, return value has shape + `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss + values; otherwise, it is a scalar. If the model has multiple + outputs, you can use a different loss on each output by passing a + dictionary or a list of losses. The loss value that will be + minimized by the model will then be the sum of all individual + losses, unless `loss_weights` is specified. + metrics: List of metrics to be evaluated by the model during + training and testing. Each of this can be a string (name of a + built-in function), function or a `tf.keras.metrics.Metric` + instance. See `tf.keras.metrics`. Typically you will use + `metrics=['accuracy']`. + A function is any callable with the signature `result = fn(y_true, + y_pred)`. To specify different metrics for different outputs of a + multi-output model, you could also pass a dictionary, such as + `metrics={'output_a':'accuracy', 'output_b':['accuracy', 'mse']}`. + You can also pass a list to specify a metric or a list of metrics + for each output, such as + `metrics=[['accuracy'], ['accuracy', 'mse']]` + or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the + strings 'accuracy' or 'acc', we convert this to one of + `tf.keras.metrics.BinaryAccuracy`, + `tf.keras.metrics.CategoricalAccuracy`, + `tf.keras.metrics.SparseCategoricalAccuracy` based on the shapes + of the targets and of the model output. We do a similar + conversion for the strings 'crossentropy' and 'ce' as well. + The metrics passed here are evaluated without sample weighting; if + you would like sample weighting to apply, you can specify your + metrics via the `weighted_metrics` argument instead. + loss_weights: Optional list or dictionary specifying scalar + coefficients (Python floats) to weight the loss contributions of + different model outputs. 
The loss value that will be minimized by + the model will then be the *weighted sum* of all individual + losses, weighted by the `loss_weights` coefficients. If a list, + it is expected to have a 1:1 mapping to the model's outputs. If a + dict, it is expected to map output names (strings) to scalar + coefficients. + weighted_metrics: List of metrics to be evaluated and weighted by + `sample_weight` or `class_weight` during training and testing. + run_eagerly: Bool. If `True`, this `Model`'s logic will not be + wrapped in a `tf.function`. Recommended to leave this as `None` + unless your `Model` cannot be run inside a `tf.function`. + `run_eagerly=True` is not supported when using + `tf.distribute.experimental.ParameterServerStrategy`. Defaults to + `False`. + steps_per_execution: Int or `'auto'`. The number of batches to + run during each `tf.function` call. If set to "auto", keras will + automatically tune `steps_per_execution` during runtime. Running + multiple batches inside a single `tf.function` call can greatly + improve performance on TPUs, when used with distributed strategies + such as `ParameterServerStrategy`, or with small models with a + large Python overhead. At most, one full epoch will be run each + execution. If a number larger than the size of the epoch is + passed, the execution will be truncated to the size of the epoch. + Note that if `steps_per_execution` is set to `N`, + `Callback.on_batch_begin` and `Callback.on_batch_end` methods will + only be called every `N` batches (i.e. before/after each + `tf.function` execution). Defaults to `1`. + jit_compile: If `True`, compile the model training step with XLA. + [XLA](https://www.tensorflow.org/xla) is an optimizing compiler + for machine learning. + `jit_compile` is not enabled for by default. + Note that `jit_compile=True` + may not necessarily work for all models. + For more information on supported operations please refer to the + [XLA documentation](https://www.tensorflow.org/xla). + Also refer to + [known XLA issues](https://www.tensorflow.org/xla/known_issues) + for more details. + pss_evaluation_shards: Integer or 'auto'. Used for + `tf.distribute.ParameterServerStrategy` training only. This arg + sets the number of shards to split the dataset into, to enable an + exact visitation guarantee for evaluation, meaning the model will + be applied to each dataset element exactly once, even if workers + fail. The dataset must be sharded to ensure separate workers do + not process the same data. The number of shards should be at least + the number of workers for good performance. A value of 'auto' + turns on exact evaluation and uses a heuristic for the number of + shards based on the number of workers. 0, meaning no + visitation guarantee is provided. NOTE: Custom implementations of + `Model.test_step` will be ignored when doing exact evaluation. + Defaults to `0`. + **kwargs: Arguments supported for backwards compatibility only. + """ + if jit_compile and not tf_utils.can_jit_compile(warn=True): + jit_compile = False + self._compile_config = serialization_lib.Config( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + jit_compile=jit_compile, + ) + with self.distribute_strategy.scope(): + if "experimental_steps_per_execution" in kwargs: + logging.warning( + "The argument `steps_per_execution` is no longer " + "experimental. 
Pass `steps_per_execution` instead of " + "`experimental_steps_per_execution`." + ) + if not steps_per_execution: + steps_per_execution = kwargs.pop("experimental_steps_per_execution") + + # When compiling from an already-serialized model, we do not want to + # reapply some processing steps (e.g. metric renaming for + # multi-output models, which have prefixes added for each + # corresponding output name). + from_serialized = kwargs.pop("from_serialized", False) + + self._validate_compile(optimizer, metrics, **kwargs) + self._run_eagerly = run_eagerly + + self.optimizer = self._get_optimizer(optimizer) + self.optimizer.global_step = self._train_counter + self.main_model.optimizer = self.optimizer + + mesh = None + if self._layout_map is not None: + mesh = self._layout_map.get_default_mesh() + + if isinstance(loss, compile_utils.LossesContainer): + self.compiled_loss = loss + else: + self.compiled_loss = compile_utils.LossesContainer( + loss, + loss_weights, + output_names=self.main_model.output_names, + mesh=mesh, + ) + self.compiled_metrics = compile_utils.MetricsContainer( + metrics, + weighted_metrics, + output_names=self.main_model.output_names, + from_serialized=from_serialized, + mesh=mesh, + ) + + if steps_per_execution == "auto": + if self._steps_per_execution is None: + self._configure_steps_per_execution(1) + self._steps_per_execution_tuner = ( + steps_per_execution_tuning.StepsPerExecutionTuner(self.optimizer, self._steps_per_execution) + ) + self._autotune_steps_per_execution = True + else: + self._configure_steps_per_execution(steps_per_execution or 1) + + self._pss_evaluation_shards = self._infer_exact_eval_shards(pss_evaluation_shards) + + # Initializes attrs that are reset each time `compile` is called. + self._reset_compile_cache() + self._is_compiled = True + self.loss = loss or {} + if (self._run_eagerly or self.main_model.dynamic) and jit_compile: + raise ValueError("You cannot enable `run_eagerly` and `jit_compile` " + "at the same time.") + else: + self._jit_compile = jit_compile + + def _get_optimizer(self, optimizer): + """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" + + def _get_single_optimizer(opt): + opt = optimizers.get(opt) + if self.main_model.dtype_policy.name == "mixed_float16" and not isinstance(opt, lso.BaseLossScaleOptimizer): + # Loss scaling is necessary with mixed_float16 for models to + # converge to the same accuracy as with float32. + opt = lso.BaseLossScaleOptimizer(opt) + return opt + + return tf.nest.map_structure(_get_single_optimizer, optimizer) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _reset_compile_cache(self): + self.train_function = None + self.test_function = None + self.predict_function = None + # Used to cache the `tf.function`'ed `train_function` to be logged in + # TensorBoard, since the original `train_function` is not necessarily + # a `tf.function` (e.g., with ParameterServerStrategy, the + # `train_function` is a scheduling of the actual training function to a + # remote worker). + self.train_tf_function = None + + # Used to cache `trainable` attr of `Layer`s for `fit`. 
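    # (`fit()` below enters `training_utils.RespectCompiledTrainableState`,
    # which applies the `trainable` state cached here for the duration of
    # training, matching the usual Keras rule that `trainable` changes made
    # after `compile()` take effect only after recompiling.)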
+ self._compiled_trainable_state = self._get_trainable_state() + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _configure_steps_per_execution(self, steps_per_execution): + self._steps_per_execution = self._create_counter_variable(steps_per_execution) + + @property + def _should_compute_mask(self): + return False + + @property + def metrics(self): + """Return metrics added using `compile()` or `add_metric()`. + + Note: Metrics passed to `compile()` are available only after a + `keras.Model` has been trained/evaluated on actual data. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + >>> [m.name for m in model.metrics] + [] + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> model.fit(x, y) + >>> [m.name for m in model.metrics] + ['loss', 'mae'] + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> d = tf.keras.layers.Dense(2, name='out') + >>> output_1 = d(inputs) + >>> output_2 = d(inputs) + >>> model = tf.keras.models.Model( + ... inputs=inputs, outputs=[output_1, output_2]) + >>> model.add_metric( + ... tf.reduce_sum(output_2), name='mean', aggregation='mean') + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) + >>> model.fit(x, (y, y)) + >>> [m.name for m in model.metrics] + ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', + 'out_1_acc', 'mean'] + + """ + metrics = [] + if self._is_compiled: + if self.compiled_loss is not None: + metrics += self.compiled_loss.metrics + if self.compiled_metrics is not None: + metrics += self.compiled_metrics.metrics + + for l in self.main_model._flatten_layers(): + metrics.extend(l._metrics) + return metrics + + @property + def metrics_names(self): + """Returns the model's display labels for all outputs. + + Note: `metrics_names` are available only after a `keras.Model` has been + trained/evaluated on actual data. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + >>> model.metrics_names + [] + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> model.fit(x, y) + >>> model.metrics_names + ['loss', 'mae'] + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> d = tf.keras.layers.Dense(2, name='out') + >>> output_1 = d(inputs) + >>> output_2 = d(inputs) + >>> model = tf.keras.models.Model( + ... inputs=inputs, outputs=[output_1, output_2]) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) + >>> model.fit(x, (y, y)) + >>> model.metrics_names + ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', + 'out_1_acc'] + + """ + + # This property includes all output names including `loss` and + # per-output losses for backward compatibility. + return [m.name for m in self.metrics] + + @property + def distribute_strategy(self): + """The `tf.distribute.Strategy` this model was created under.""" + return self._distribution_strategy or tf.distribute.get_strategy() + + @property + def run_eagerly(self): + """Settable attribute indicating whether the model should run eagerly. + + Running eagerly means that your model will be run step by step, + like Python code. 
Your model might run slower, but it should become + easier for you to debug it by stepping into individual layer calls. + + By default, we will attempt to compile your model to a static graph to + deliver the best execution performance. + + Returns: + Boolean, whether the model should run eagerly. + """ + if self.main_model.dynamic and self._run_eagerly == False: + # TODO(fchollet): consider using py_func to enable this. + raise ValueError( + "Your model contains layers that can only be " + "successfully run in eager execution (layers " + "constructed with `dynamic=True`). " + "You cannot set `run_eagerly=False`." + ) + + if self._cluster_coordinator and self._run_eagerly: + raise ValueError("When using `Model` with `ParameterServerStrategy`, " + "`run_eagerly` is not supported.") + + # Run eagerly logic, by priority: + # (1) Dynamic models must be run eagerly. + # (2) Explicitly setting run_eagerly causes a Model to be run eagerly. + # (3) Not explicitly setting run_eagerly defaults to TF's global + # setting. + return ( + self.main_model.dynamic or self._run_eagerly or + (tf.config.functions_run_eagerly() and self._run_eagerly is None) + ) + + @run_eagerly.setter + def run_eagerly(self, value): + self._run_eagerly = value + + @property + def autotune_steps_per_execution(self): + """Settable property to enable tuning for steps_per_execution""" + return self._autotune_steps_per_execution + + @autotune_steps_per_execution.setter + def autotune_steps_per_execution(self, value): + self._autotune_steps_per_execution = value + if value and self._steps_per_execution_tuner is None: + if self._steps_per_execution is None: + self._configure_steps_per_execution(1) + self._steps_per_execution_tuner = ( + steps_per_execution_tuning.StepsPerExecutionTuner(self.optimizer, self._steps_per_execution) + ) + + @property + def steps_per_execution(self): + """Settable `steps_per_execution variable. Requires a compiled model.""" + return self._steps_per_execution + + @steps_per_execution.setter + def steps_per_execution(self, value): + if self._steps_per_execution is None: + self._configure_steps_per_execution(value) + else: + self._steps_per_execution.assign(value) + + @property + def jit_compile(self): + """Specify whether to compile the model with XLA. + + [XLA](https://www.tensorflow.org/xla) is an optimizing compiler + for machine learning. `jit_compile` is not enabled by default. + Note that `jit_compile=True` may not necessarily work for all models. + + For more information on supported operations please refer to the + [XLA documentation](https://www.tensorflow.org/xla). Also refer to + [known XLA issues](https://www.tensorflow.org/xla/known_issues) + for more details. + """ + return self._jit_compile + + @jit_compile.setter + def jit_compile(self, value): + # Function remains cached with previous jit_compile settings + if self._jit_compile == value: + # Avoid resetting compiler cache if possible if the value is the + # same + return + # Check if TensorFlow is compiled with XLA before setting the value + if value and not tf_utils.can_jit_compile(warn=True): + self._jit_compile = False + return + + self._jit_compile = value + # Setting `jit_compile` should invalidate previously cached functions. + self._reset_compile_cache() + + @property + def distribute_reduction_method(self): + """The method employed to reduce per-replica values during training. + + Unless specified, the value "auto" will be assumed, indicating that + the reduction strategy should be chosen based on the current + running environment. 
+ See `reduce_per_replica` function for more details. + + """ + return self._distribute_reduction_method or "auto" + + @distribute_reduction_method.setter + def distribute_reduction_method(self, value): + self._distribute_reduction_method = value + + def _validate_target_and_loss(self, y, loss): + """Raises error if target or loss is not found. + + This method verifies that the target and loss are properly populated + when applicable, or raises errors. + + Args: + y: the target for training. + loss: the total loss tensor including loss added via `compile` and + `add_loss`. + """ + + # `self.loss` references the loss added via `compile` call. If users + # have provided such, the target must be provided; otherwise it's a user + # error. Note that `self.loss` does not include losses added via + # `add_loss`, and it is a valid use when such loss from `add_loss` + # exists and target does not. + if self.loss and y is None: + raise ValueError( + "Target data is missing. Your model was compiled with " + f"loss={self.loss}, " + "and therefore expects target data to be provided in `fit()`." + ) + + # For training, there must be compiled loss or regularization loss to + # exist in order to apply the gradients. If one is not found, it means + # no loss was supplied via `compile` or `add_loss`. + elif loss is None: + raise ValueError( + "No loss found. You may have forgotten to provide a `loss` " + "argument in the `compile()` method." + ) + + def train_step(self, data): + """The logic for one training step. + + This method can be overridden to support custom training logic. + For concrete examples of how to override this method see + [Customizing what happens in fit]( + https://www.tensorflow.org/guide/tf_keras/customizing_what_happens_in_fit). + This method is called by `Model.make_train_function`. + + This method should contain the mathematical logic for one step of + training. This typically includes the forward pass, loss calculation, + backpropagation, and metric updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings), should be left to + `Model.make_train_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + # Run forward pass. + with tf.GradientTape() as tape: + y_pred = self.main_model(x, training=True) + loss = self.compute_loss(x, y, y_pred, sample_weight) + self._validate_target_and_loss(y, loss) + # Run backwards pass. + self.optimizer.minimize(loss, self.main_model.trainable_variables, tape=tape) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def hvd_train_step(self, data): + """The logic for one training step on Horovod. + + This method can be overridden to support custom training logic. + For concrete examples of how to override this method see + [Customizing what happens in fit]( + https://www.tensorflow.org/guide/tf_keras/customizing_what_happens_in_fit). + This method is called by `Model.make_train_function`. + + This method should contain the mathematical logic for one step of + training. This typically includes the forward pass, loss calculation, + backpropagation, and metric updates. + + Configuration details for *how* this logic is run (e.g. 
`tf.function` + and `tf.distribute.Strategy` settings), should be left to + `Model.make_train_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + # Run forward pass. + with tf.GradientTape() as dp_tape, tf.GradientTape() as mp_tape: + y_pred = self.main_model(x, training=True) + loss = self.compute_loss(x, y, y_pred, sample_weight) + dp_tape = hvd.DistributedGradientTape(dp_tape, sparse_as_dense=False) + self._validate_target_and_loss(y, loss) + # Run backwards pass. + dp_vars, mp_vars = [], [] + for x in self.main_model.variables: + if isinstance(x, kv_variable_ops.EmbeddingVariable): + mp_vars.append(x) + else: + dp_vars.append(x) + self.optimizer.minimize(loss, dp_vars, tape=dp_tape) + self.optimizer.minimize(loss, mp_vars, tape=mp_tape) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): + """Compute the total loss, validate it, and return it. + + Subclasses can optionally override this method to provide custom loss + computation logic. + + Example: + ```python + class MyModel(tf.keras.Model): + + def __init__(self, *args, **kwargs): + super(MyModel, self).__init__(*args, **kwargs) + self.loss_tracker = tf.keras.metrics.Mean(name='loss') + + def compute_loss(self, x, y, y_pred, sample_weight): + loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) + loss += tf.add_n(self.losses) + self.loss_tracker.update_state(loss) + return loss + + def reset_metrics(self): + self.loss_tracker.reset_states() + + @property + def metrics(self): + return [self.loss_tracker] + + tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + + inputs = tf.keras.layers.Input(shape=(10,), name='my_input') + outputs = tf.keras.layers.Dense(10)(inputs) + model = MyModel(inputs, outputs) + model.add_loss(tf.reduce_sum(outputs)) + + optimizer = tf.keras.optimizers.SGD() + model.compile(optimizer, loss='mse', steps_per_execution=10) + model.fit(dataset, epochs=2, steps_per_epoch=10) + print('My custom loss: ', model.loss_tracker.result().numpy()) + ``` + + Args: + x: Input data. + y: Target data. + y_pred: Predictions returned by the model (output of `model(x)`) + sample_weight: Sample weights for weighting the loss function. + + Returns: + The total loss as a `tf.Tensor`, or `None` if no loss results (which + is the case when called by `Model.test_step`). + """ + del x # The default implementation does not use `x`. + return self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.main_model.losses) + + def compute_metrics(self, x, y, y_pred, sample_weight): + """Update metric states and collect all metrics to be returned. + + Subclasses can optionally override this method to provide custom metric + updating and collection logic. + + Example: + ```python + class MyModel(tf.keras.Sequential): + + def compute_metrics(self, x, y, y_pred, sample_weight): + + # This super call updates `self.compiled_metrics` and returns + # results for all metrics listed in `self.metrics`. 
+ metric_results = super(MyModel, self).compute_metrics( + x, y, y_pred, sample_weight) + + # Note that `self.custom_metric` is not listed in `self.metrics`. + self.custom_metric.update_state(x, y, y_pred, sample_weight) + metric_results['custom_metric_name'] = self.custom_metric.result() + return metric_results + ``` + + Args: + x: Input data. + y: Target data. + y_pred: Predictions returned by the model (output of `model.call(x)`) + sample_weight: Sample weights for weighting the loss function. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the + values of the metrics listed in `self.metrics` are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + del x # The default implementation does not use `x`. + self.compiled_metrics.update_state(y, y_pred, sample_weight) + return self.get_metrics_result() + + def get_metrics_result(self): + """Returns the model's metrics values as a dict. + + If any of the metric result is a dict (containing multiple metrics), + each of them gets added to the top level returned dict of this method. + + Returns: + A `dict` containing values of the metrics listed in `self.metrics`. + Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + # Collect metrics to return + return_metrics = {} + for metric in self.metrics: + result = metric.result() + if isinstance(result, dict): + return_metrics.update(result) + else: + return_metrics[metric.name] = result + return return_metrics + + def _validate_and_get_metrics_result(self, logs): + """Returns model metrics as a dict if the keys match with input logs. + + When the training / evalution is performed with asynchronous steps, such + as the case with `tf.distribute.ParameterServerStrategy`, the last + scheduled `train / test_step` may not give the latest metrics because it + is not guaranteed to be executed the last. This method gets metrics from + the model directly instead of relying on the return from last step + function. + + It logs a warning if the metric results could not be overridden when + used with `tf.distribute.ParameterServerStrategy`. + + When the user has custom train / test step functions, the metrics + returned may be different from `Model.metrics`. In those instances, + this function will be no-op and return the logs. + + Args: + logs: A `dict` of metrics returned by train / test step function. + + Returns: + A `dict` containing values of the metrics listed in `self.metrics` + when logs and model metrics keys match. Otherwise it returns input + `logs`. + """ + PSS_WARN_MSG = "Could not get Model metric results. \ + Using the results of last step function could lead to incorrect \ + results when used with ParameterServerStrategy" + + try: + metric_logs = self.get_metrics_result() + except TypeError: + if self._cluster_coordinator: + logging.warning(PSS_WARN_MSG) + else: + # Verify that train / test step logs passed and metric logs have + # matching keys. Could be different when using custom step functions + if isinstance(logs, dict) and set(logs.keys()) == set(metric_logs.keys()): + logs = tf_utils.sync_to_numpy_or_python_type(metric_logs) + elif self._cluster_coordinator: + logging.warning(PSS_WARN_MSG) + return logs + + def _aggregate_exact_metrics(self, logs): + # When doing exact evaluation, `logs` is a list of each data shard's + # metric variables, which will be used to update the metrics. 
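    # Illustrative (hypothetical names/values): `logs` might look like
    #   [{"loss": [sum_0, count_0], "mae": [sum_0, count_0]},   # shard 0
    #    {"loss": [sum_1, count_1], "mae": [sum_1, count_1]}]   # shard 1
    # i.e. one dict per shard mapping a metric name to that shard's local
    # weight values; the loop below accumulates them into the live metrics
    # before the final results are read.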
+ for shard_result in logs: + for metric in self.metrics: + if metric.name not in shard_result.keys(): + logging.log_first_n( + logging.WARN, + f"No matching result found for metric {metric.name}. " + "This metric's computed result may be incorrect.", + 3, + ) + continue + metric_result = shard_result[metric.name] + if len(metric_result) != len(metric.weights): + raise ValueError( + f"Expected {len(metric.weights)} variables in result " + f"for metric {metric.name}, but found " + f"{len(metric_result)}." + ) + for weight, val in zip(metric.weights, metric_result): + weight.assign_add(val) + return self.get_metrics_result() + + def make_train_function(self, force=False): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model.train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the train function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None and not force: + return self.train_function + + def step_function(iterator): + """Runs a single training step.""" + + def run_step(data): + outputs = self.train_step(data) + # Ensure counter is updated only if `train_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._train_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. + if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def train_function(iterator): + """Runs a training execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + if self._cluster_coordinator: + self.train_function = (lambda it: self._cluster_coordinator.schedule(train_function, args=(it,))) + else: + self.train_function = train_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. 
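      # In the coordinator branch below, the step count is read once at
      # schedule time (`self._steps_per_execution.value()`); the plain branch
      # after it instead closes over the variable and reads it when the
      # `tf.function` actually executes.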
+ elif self._cluster_coordinator: + + def train_function(iterator, steps_per_execution): + """Runs a training execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + self.train_function = lambda it: self._cluster_coordinator.schedule( + train_function, args=(it, self._steps_per_execution.value()) + ) + else: + + def train_function(iterator): + """Runs a training execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + self.train_function = train_function + + return self.train_function + + def make_hvd_train_function(self, force=False): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model.train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the train function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None and not force: + return self.train_function + + def step_function(iterator): + """Runs a single training step.""" + + def do_broadcast(): + model_broadcast_vars = [ + x for x in self.main_model.variables + if not isinstance(x, (TrainableWrapper, DEResourceVariable, kv_variable_ops.EmbeddingVariable)) + ] + opt_broadcast_vars = [ + x for x in self.optimizer.variables() + if not isinstance(x, (TrainableWrapper, DEResourceVariable, kv_variable_ops.EmbeddingVariable)) + ] + print_op = tf.print( + f"Broadcasting {len(model_broadcast_vars)} model variables & {len(opt_broadcast_vars)} optimizer variables...", + output_stream=sys.stdout + ) + with tf.control_dependencies([print_op]): + hvd.broadcast_variables(model_broadcast_vars + opt_broadcast_vars, root_rank=0) + self.first_batch.assign(False) + + def run_step(data): + outputs = self.hvd_train_step(data) + # Ensure counter is updated only if `hvd_train_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._train_counter.assign_add(1) + if self.first_batch: + do_broadcast() + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + data = next(iterator) + outputs = run_step(data) + return outputs + + # Special case if steps_per_execution is one. 
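      # Note: unlike `make_train_function` above, the Horovod path invokes
      # `run_step` directly rather than through `distribute_strategy.run` plus
      # a per-replica reduction; each worker consumes its own data shard,
      # dense-variable gradients are averaged across workers by
      # `hvd.DistributedGradientTape` inside `hvd_train_step`, and
      # `EmbeddingVariable`s are updated from the local tape.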
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def train_function(iterator): + """Runs a training execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + self.train_function = train_function + else: + + def train_function(iterator): + """Runs a training execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + self.train_function = train_function + + return self.train_function + + @traceback_utils.filter_traceback + def fit( + self, + x=None, + y=None, + batch_size=None, + epochs=None, + verbose="auto", + callbacks=[], + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_batch_size=None, + validation_freq=1, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Trains the model for a fixed number of epochs (dataset iterations). + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a + callable that takes a single argument of type + `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. + `DatasetCreator` should be used when users prefer to specify the + per-replica batching and sharding logic for the `Dataset`. + See `tf.keras.utils.experimental.DatasetCreator` doc for more + information. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given below. If these + include `sample_weights` as a third component, note that sample + weighting applies to the `weighted_metrics` argument but not the + `metrics` argument in `compile()`. If using + `tf.distribute.experimental.ParameterServerStrategy`, only + `DatasetCreator` type is supported for `x`. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset, generator, + or `keras.utils.Sequence` instance, `y` should + not be specified (since targets will be obtained from `x`). + batch_size: Integer or `None`. + Number of samples per gradient update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + epochs: Integer. Number of epochs to train the model. 
+ An epoch is an iteration over the entire `x` and `y` + data provided + (unless the `steps_per_epoch` flag is set to + something other than None). + Note that in conjunction with `initial_epoch`, + `epochs` is to be understood as "final epoch". + The model is not trained for a number of iterations + given by `epochs`, but merely until the epoch + of index `epochs` is reached. + verbose: 'auto', 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + 'auto' becomes 1 for most cases, but 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so verbose=2 is + recommended when not running interactively (eg, in a production + environment). Defaults to 'auto'. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during training. + See `tf.keras.callbacks`. Note + `tf.keras.callbacks.ProgbarLogger` and + `tf.keras.callbacks.History` callbacks are created automatically + and need not be passed into `model.fit`. + `tf.keras.callbacks.ProgbarLogger` is created or not based on + `verbose` argument to `model.fit`. + Callbacks with batch-level calls are currently unsupported with + `tf.distribute.experimental.ParameterServerStrategy`, and users + are advised to implement epoch-level calls instead with an + appropriate `steps_per_epoch` value. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + The model will set apart this fraction of the training data, + will not train on it, and will evaluate + the loss and any model metrics + on this data at the end of each epoch. + The validation data is selected from the last samples + in the `x` and `y` data provided, before shuffling. This + argument is not supported when `x` is a dataset, generator or + `keras.utils.Sequence` instance. + If both `validation_data` and `validation_split` are provided, + `validation_data` will override `validation_split`. + `validation_split` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + validation_data: Data on which to evaluate + the loss and any model metrics at the end of each epoch. + The model will not be trained on this data. Thus, note the fact + that the validation loss of data provided using + `validation_split` or `validation_data` is not affected by + regularization layers like noise and dropout. + `validation_data` will override `validation_split`. + `validation_data` could be: + - A tuple `(x_val, y_val)` of Numpy arrays or tensors. + - A tuple `(x_val, y_val, val_sample_weights)` of NumPy + arrays. + - A `tf.data.Dataset`. + - A Python generator or `keras.utils.Sequence` returning + `(inputs, targets)` or `(inputs, targets, sample_weights)`. + `validation_data` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + shuffle: Boolean (whether to shuffle the training data + before each epoch) or str (for 'batch'). This argument is + ignored when `x` is a generator or an object of tf.data.Dataset. + 'batch' is a special option for dealing + with the limitations of HDF5 data; it shuffles in batch-sized + chunks. Has no effect when `steps_per_epoch` is not `None`. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) value, used for weighting the loss function + (during training only). + This can be useful to tell the model to + "pay more attention" to samples from + an under-represented class. 
When `class_weight` is specified + and targets have a rank of 2 or greater, either `y` must be + one-hot encoded, or an explicit final dimension of `1` must + be included for sparse class labels. + sample_weight: Optional Numpy array of weights for + the training samples, used for weighting the loss function + (during training only). You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), + or in the case of temporal data, + you can pass a 2D array with shape + `(samples, sequence_length)`, + to apply a different weight to every timestep of every sample. + This argument is not supported when `x` is a dataset, generator, + or `keras.utils.Sequence` instance, instead provide the + sample_weights as the third element of `x`. + Note that sample weighting does not apply to metrics specified + via the `metrics` argument in `compile()`. To apply sample + weighting to your metrics, you can specify them via the + `weighted_metrics` in `compile()` instead. + initial_epoch: Integer. + Epoch at which to start training + (useful for resuming a previous training run). + steps_per_epoch: Integer or `None`. + Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps_per_epoch' + is None, the epoch will run until the input dataset is + exhausted. When passing an infinitely repeating dataset, you + must specify the `steps_per_epoch` argument. If + `steps_per_epoch=-1` the training will run indefinitely with an + infinitely repeating dataset. This argument is not supported + with array inputs. + When using `tf.distribute.experimental.ParameterServerStrategy`: + * `steps_per_epoch=None` is not supported. + validation_steps: Only relevant if `validation_data` is provided and + is a `tf.data` dataset. Total number of steps (batches of + samples) to draw before stopping when performing validation + at the end of every epoch. If 'validation_steps' is None, + validation will run until the `validation_data` dataset is + exhausted. In the case of an infinitely repeated dataset, it + will run into an infinite loop. If 'validation_steps' is + specified and only part of the dataset will be consumed, the + evaluation will start from the beginning of the dataset at each + epoch. This ensures that the same validation samples are used + every time. + validation_batch_size: Integer or `None`. + Number of samples per validation batch. + If unspecified, will default to `batch_size`. + Do not specify the `validation_batch_size` if your data is in + the form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + validation_freq: Only relevant if validation data is provided. + Integer or `collections.abc.Container` instance (e.g. list, tuple, + etc.). If an integer, specifies how many training epochs to run + before a new validation run is performed, e.g. `validation_freq=2` + runs validation every 2 epochs. If a Container, specifies the + epochs on which to run validation, e.g. + `validation_freq=[1, 2, 10]` runs validation at the end of the + 1st, 2nd, and 10th epochs. + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. 
If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up + when using process-based threading. If unspecified, `workers` + will default to 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + Unpacking behavior for iterator-like inputs: + A common pattern is to pass a tf.data.Dataset, generator, or + tf.keras.utils.Sequence to the `x` argument of fit, which will in fact + yield not only features (x) but optionally targets (y) and sample + weights. TF-Keras requires that the output of such iterator-likes be + unambiguous. The iterator should return a tuple of length 1, 2, or 3, + where the optional second and third elements will be used for y and + sample_weight respectively. Any other type provided will be wrapped in + a length one tuple, effectively treating everything as 'x'. When + yielding dicts, they should still adhere to the top-level tuple + structure. + e.g. `({"x0": x0, "x1": x1}, y)`. TF-Keras will not attempt to + separate features, targets, and weights from the keys of a single + dict. + A notable unsupported data type is the namedtuple. The reason is + that it behaves like both an ordered datatype (tuple) and a mapping + datatype (dict). So given a namedtuple of the form: + `namedtuple("example_tuple", ["y", "x"])` + it is ambiguous whether to reverse the order of the elements when + interpreting the value. Even worse is a tuple of the form: + `namedtuple("other_tuple", ["x", "y", "z"])` + where it is unclear if the tuple was intended to be unpacked into x, + y, and sample_weight or passed through as a single element to `x`. As + a result the data processing code will simply raise a ValueError if it + encounters a namedtuple. (Along with instructions to remedy the + issue.) + + Returns: + A `History` object. Its `History.history` attribute is + a record of training loss values and metrics values + at successive epochs, as well as validation loss values + and validation metrics values (if applicable). + + Raises: + RuntimeError: 1. If the model was never compiled or, + 2. If `model.fit` is wrapped in `tf.function`. + + ValueError: In case of mismatch between the provided input data + and what the model expects or when the input data is empty. + """ + if steps_per_epoch and flags.FLAGS.use_horovod: + try: + import horovod.tensorflow as hvd + steps_array = hvd.allgather_object(steps_per_epoch, name='check_train_step') + logger.debug(f"steps_array = {steps_array}") + assert max(set(steps_array)) == min(set(steps_array)) + except: + raise ValueError( + f"steps_per_epoch = {steps_per_epoch}, different rank should have same steps when using Horovod." + ) + # Legacy graph support is contained in `training_v1.Model`. 
+ if batch_size is None: + batch_size = flags.FLAGS.batch_size + if epochs is None: + epochs = flags.FLAGS.epochs + if flags.FLAGS.stop_steps >= 0: + epochs = 1 + if steps_per_epoch is None: + steps_per_epoch = flags.FLAGS.stop_steps + else: + steps_per_epoch = min(steps_per_epoch, flags.FLAGS.stop_steps) + + version_utils.disallow_legacy_graph("Model", "fit") + self._assert_compile_was_called() + self._check_call_args("fit") + training_module._disallow_inside_tf_function("fit") + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + + if validation_split and validation_data is None: + # Create the validation data using the training data. Only supported + # for `Tensor` and `NumPy` input. + ( + x, + y, + sample_weight, + ), validation_data = data_adapter.train_validation_split( + (x, y, sample_weight), validation_split=validation_split + ) + + if validation_data: + ( + val_x, + val_y, + val_sample_weight, + ) = data_adapter.unpack_x_y_sample_weight(validation_data) + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = (tf.distribute.experimental.coordinator.ClusterCoordinator(self.distribute_strategy)) + + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps_per_epoch, + initial_epoch=initial_epoch, + epochs=epochs, + shuffle=shuffle, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + ) + + for callback in callbacks: + if hasattr(callback, 'set_optimizer') and callable(callback.set_optimizer): + callback.set_optimizer(self.optimizer) + if hasattr(callback, 'set_models') and callable(callback.set_models): + callback.set_models(self._model) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + if flags.FLAGS.use_horovod: + if is_main_process(): + callbacks += [ProgbarLogger(count_mode="steps")] + callbacks = HvdCallbackList( + callbacks, + add_history=True, + add_progbar=False, + model=self.main_model, + verbose=verbose, + epochs=epochs, + steps=data_handler.inferred_steps, + ) + else: + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self.main_model, + verbose=verbose, + epochs=epochs, + steps=data_handler.inferred_steps, + ) + + self.stop_training = False + self.train_function = self.make_train_function() if not flags.FLAGS.use_horovod else self.make_hvd_train_function( + ) + self._train_counter.assign(0) + callbacks.on_train_begin() + training_logs = None + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + # Handle fault-tolerance for multi-worker. + # TODO(omalleyt): Fix the ordering issues that mean this has to + # happen after `callbacks.on_train_begin`. 
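      # Resume support: `_maybe_load_initial_counters_from_ckpt` (defined
      # elsewhere in this class) consults the training state that
      # `ModelCheckpoint` maintains (see `self._training_state` in `__init__`),
      # so a restarted job picks up from the last completed epoch/step instead
      # of the user-supplied `initial_epoch`.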
+ steps_per_epoch_inferred = (steps_per_epoch or data_handler.inferred_steps) + ( + data_handler._initial_epoch, + data_handler._initial_step, + ) = self._maybe_load_initial_counters_from_ckpt(steps_per_epoch_inferred, initial_epoch) + logs = None + for epoch, iterator in data_handler.enumerate_epochs(): + self.reset_metrics() + callbacks.on_epoch_begin(epoch) + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace( + "train", + epoch_num=epoch, + step_num=step, + batch_size=batch_size, + _r=1, + ): + callbacks.on_train_batch_begin(step) + tmp_logs = self.train_function(iterator) + if data_handler.should_sync: + context.async_wait() + # No error, now safe to assign to logs. + logs = tmp_logs + end_step = step + data_handler.step_increment + callbacks.on_train_batch_end(end_step, logs) + if self.stop_training: + break + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if logs is None: + raise ValueError( + "Unexpected result of `train_function` " + "(Empty logs). This could be due to issues in input " + "pipeline that resulted in an empty dataset. " + "Otherwise, please use " + "`Model.compile(..., run_eagerly=True)`, or " + "`tf.config.run_functions_eagerly(True)` for more " + "information of where went wrong, or file a " + "issue/bug to `tf.keras`." + ) + # Override with model metrics instead of last step logs + logs = self._validate_and_get_metrics_result(logs) + epoch_logs = copy.copy(logs) + + # Run validation. + if validation_data and self._should_eval(epoch, validation_freq): + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + # Create data_handler for evaluation and cache it. + if getattr(self, "_eval_data_handler", None) is None: + self._eval_data_handler = data_adapter.get_data_handler( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps_per_epoch=validation_steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + val_logs = self.evaluate( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps=validation_steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + return_dict=True, + _use_cached_eval_dataset=True, + ) + val_logs = {"val_" + name: val for name, val in val_logs.items()} + epoch_logs.update(val_logs) + + callbacks.on_epoch_end(epoch, epoch_logs) + training_logs = epoch_logs + if self.stop_training: + break + + if isinstance(self.optimizer, optimizer.Optimizer) and epochs > 0: + self.optimizer.finalize_variable_values(self.trainable_variables) + + # If eval data_handler exists, delete it after all epochs are done. + if getattr(self, "_eval_data_handler", None) is not None: + del self._eval_data_handler + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_train_end(logs=training_logs) + return self.history + + def test_step(self, data): + """The logic for one evaluation step. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.make_test_function`. + + This function should contain the mathematical logic for one step of + evaluation. 
+ This typically includes the forward pass, loss calculation, and metrics + updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_test_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_test_batch_end`. Typically, the + values of the `Model`'s metrics are returned. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + y_pred = self.main_model(x, training=False) + # Updates stateful loss metrics. + self.compute_loss(x, y, y_pred, sample_weight) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def _make_test_function_exact(self): + if getattr(self, "_shard_test_function", None): + return self._shard_test_function + + def step_function(batch): + + def run_step(data): + # TODO(b/272050910): Use sample_weight for weighted metrics. + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + y_pred = self.main_model(x, training=False) + return x, y, y_pred, sample_weight + + if self._jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + outputs = self.distribute_strategy.run(run_step, args=(batch,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + def shard_test_function(dataset, total_shards, shard_idx): + # Copy loss and metric variables to the worker and work with them + # locally. This ensures each shard function is atomic: if a worker + # is preempted, the intermediate progress is discarded and that + # shard is retried. This in turn guarantees exactly-once visitation. + local_unweighted_metrics, local_weighted_metrics = [], [] + with tf_utils.with_metric_local_vars_scope(): + # TODO(jmullenbach): implement and use a clone for + # `MetricsContainer` and use its `update_state` method directly.
+ for metric in self.compiled_metrics.unweighted_metrics: + if metric is not None: + local_unweighted_metrics.append(base_metric.clone_metric(metric)) + for metric in self.compiled_metrics.weighted_metrics: + if metric is not None: + local_weighted_metrics.append(base_metric.clone_metric(metric)) + local_loss = compile_utils.LossesContainer.from_config(self.compiled_loss.get_config()) + + dataset = input_ops.auto_shard_dataset(dataset, total_shards, shard_idx) + iterator = iter(dataset) + with distribute_utils.cache_variable_reads(): + for batch in iterator: + x, y, y_pred, sample_weight = step_function(batch) + for weighted_metric in local_weighted_metrics: + weighted_metric.update_state(y, y_pred, sample_weight) + for unweighted_metric in local_unweighted_metrics: + unweighted_metric.update_state(y, y_pred) + local_loss(y, y_pred, sample_weight) + local_metrics = (local_unweighted_metrics + local_weighted_metrics + local_loss.metrics) + outputs = {metric.name: metric.weights for metric in local_metrics} + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._test_counter.assign_add(1) + return outputs + + if not self.run_eagerly: + shard_test_function = tf.function(shard_test_function, reduce_retracing=True) + + self._shard_test_function = (lambda *args: self._cluster_coordinator.schedule( + shard_test_function, + args=args, + )) + return self._shard_test_function + + def make_test_function(self, force=False): + """Creates a function that executes one step of evaluation. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.evaluate` and `Model.test_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual evaluation + logic to `Model.test_step`. + + This function is cached the first time `Model.evaluate` or + `Model.test_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the test function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_test_batch_end`. + """ + if self.test_function is not None and not force: + return self.test_function + + def step_function(iterator): + """Runs a single evaluation step.""" + + def run_step(data): + outputs = self.test_step(data) + # Ensure counter is updated only if `test_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._test_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. 
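+ # With a single step per execution (and no autotuning), each call to the test function runs exactly one step, so `step_function` is wrapped directly instead of being looped over `tf.range(steps_per_execution)`.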
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def test_function(iterator): + """Runs a test execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + + if self._cluster_coordinator: + self.test_function = (lambda it: self._cluster_coordinator.schedule(test_function, args=(it,))) + else: + self.test_function = test_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. + elif self._cluster_coordinator: + + def test_function(iterator, steps_per_execution): + """Runs a test execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + + self.test_function = lambda it: self._cluster_coordinator.schedule( + test_function, args=(it, self._steps_per_execution.value()) + ) + else: + + def test_function(iterator): + """Runs a test execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + self.test_function = test_function + + return self.test_function + + @traceback_utils.filter_traceback + def evaluate( + self, + x=None, + y=None, + batch_size=None, + verbose="auto", + sample_weight=None, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + return_dict=False, + **kwargs, + ): + """Returns the loss value & metrics values for the model in test mode. + + Computation is done in batches (see the `batch_size` arg.) + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + If `x` is a dataset, generator or `keras.utils.Sequence` instance, + `y` should not be specified (since targets will be obtained from + the iterator/dataset). + batch_size: Integer or `None`. Number of samples per batch of + computation. If unspecified, `batch_size` will default to 32. Do + not specify the `batch_size` if your data is in the form of a + dataset, generators, or `keras.utils.Sequence` instances (since + they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. + `"auto"` becomes 1 for most cases, and to 2 when used with + `ParameterServerStrategy`. 
Note that the progress bar is not + particularly useful when logged to a file, so `verbose=2` is + recommended when not running interactively (e.g. in a production + environment). Defaults to 'auto'. + sample_weight: Optional Numpy array of weights for the test samples, + used for weighting the loss function. You can either pass a flat + (1D) Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), or in the case of + temporal data, you can pass a 2D array with shape `(samples, + sequence_length)`, to apply a different weight to every + timestep of every sample. This argument is not supported when + `x` is a dataset, instead pass sample weights as the third + element of `x`. + steps: Integer or `None`. Total number of steps (batches of samples) + before declaring the evaluation round finished. Ignored with the + default value of `None`. If x is a `tf.data` dataset and `steps` + is None, 'evaluate' will run until the dataset is exhausted. This + argument is not supported with array inputs. + callbacks: List of `keras.callbacks.Callback` instances. List of + callbacks to apply during evaluation. See + [callbacks](https://www.tensorflow.org/api_docs/python/tf/tf_keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default to + 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + **kwargs: Unused at this time. + + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.evaluate` is wrapped in a `tf.function`. 
+ """ + version_utils.disallow_legacy_graph("Model", "evaluate") + self._assert_compile_was_called() + self._check_call_args("evaluate") + self._check_sample_weight_warning(x, sample_weight) + training_module._disallow_inside_tf_function("evaluate") + use_cached_eval_dataset = kwargs.pop("_use_cached_eval_dataset", False) + if kwargs: + raise TypeError(f"Invalid keyword arguments: {list(kwargs.keys())}") + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = (tf.distribute.experimental.coordinator.ClusterCoordinator(self.distribute_strategy)) + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + with self.distribute_strategy.scope(): + # Use cached evaluation data only when it's called in `Model.fit` + if (use_cached_eval_dataset and getattr(self, "_eval_data_handler", None) is not None): + data_handler = self._eval_data_handler + else: + # Creates a `tf.data.Dataset` and handles batch and epoch + # iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler.inferred_steps, + ) + + # Initialize to prevent errors if 0 epochs are evaluated. + logs = {} + + test_function_runner = self._get_test_function_runner(callbacks) + self._test_counter.assign(0) + callbacks.on_test_begin() + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + for ( + _, + dataset_or_iterator, + ) in data_handler.enumerate_epochs(): # Single epoch. + self.reset_metrics() + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace("test", step_num=step, _r=1): + callbacks.on_test_batch_begin(step) + logs = test_function_runner.run_step( + dataset_or_iterator, + data_handler, + step, + self._pss_evaluation_shards, + ) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + # Override with model metrics instead of last step logs + if self._pss_evaluation_shards: + logs = self._aggregate_exact_metrics(logs) + else: + logs = self._validate_and_get_metrics_result(logs) + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_test_end(logs=logs) + + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def _disallow_exact_eval_with_add_metrics(self): + metrics_from_add_metric = [metric for layer in self._flatten_layers() for metric in layer._metrics] + compiled_metrics = self.compiled_metrics.metrics + if any([metric not in compiled_metrics for metric in metrics_from_add_metric]): + raise ValueError( + "Detected that a metric was added to this model " + "via `Model.add_metric`. This is not currently " + "supported when using exact evaluation with " + "`tf.distribute.ParameterServerStrategy`." 
+ ) + + def _infer_exact_eval_shards(self, pss_evaluation_shards): + if not self.distribute_strategy._should_use_with_coordinator: + return 0 + if pss_evaluation_shards == "auto": + # TODO(b/264265138) evaluate and improve this heuristic + return self.distribute_strategy._num_workers * 5 + return pss_evaluation_shards + + def _get_test_function_runner(self, callbacks): + if (self._pss_evaluation_shards and self.distribute_strategy._should_use_with_coordinator): + self.test_function = self._make_test_function_exact() + test_function_runner = training_module._ExactTestFunction(self.test_function, callbacks) + else: + self.test_function = self.make_test_function() + test_function_runner = training_module._TestFunction(self.test_function, callbacks) + return test_function_runner + + def predict_step(self, data): + """The logic for one inference step. + + This method can be overridden to support custom inference logic. + This method is called by `Model.make_predict_function`. + + This method should contain the mathematical logic for one step of + inference. This typically includes the forward pass. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_predict_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + The result of one inference step, typically the output of calling the + `Model` on data. + """ + x, _, _ = data_adapter.unpack_x_y_sample_weight(data) + return self.main_model(x, training=False) + + def make_predict_function(self, force=False): + """Creates a function that executes one step of inference. + + This method can be overridden to support custom inference logic. + This method is called by `Model.predict` and `Model.predict_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual inference + logic to `Model.predict_step`. + + This function is cached the first time `Model.predict` or + `Model.predict_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the predict function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return the outputs of the `Model`. + """ + if self.predict_function is not None and not force: + return self.predict_function + + def step_function(iterator): + """Runs a single inference step.""" + + def run_step(data): + outputs = self.predict_step(data) + # Ensure counter is updated only if `predict_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._predict_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica(outputs, self.distribute_strategy, reduction="concat") + return outputs + + # Special case if steps_per_execution is one.
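+ # With a single step per execution (and no autotuning), the outputs of one `step_function` call are returned as-is; the multi-step branch below concatenates per-step outputs and therefore declares explicit loop shape invariants.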
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def predict_function(iterator): + """Runs an evaluation execution with a single step.""" + return step_function(iterator) + + else: + + def predict_function(iterator): + """Runs an evaluation execution with multiple steps.""" + outputs = step_function(iterator) + for _ in tf.range(self._steps_per_execution - 1): + tf.autograph.experimental.set_loop_options( + shape_invariants=[ + ( + outputs, + tf.nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=True).shape, + outputs, + ), + ) + ] + ) + step_outputs = step_function(iterator) + outputs = tf.nest.map_structure(lambda t1, t2: training_module.concat([t1, t2]), outputs, step_outputs) + return outputs + + if not self.run_eagerly: + predict_function = tf.function(predict_function, reduce_retracing=True) + self.predict_function = predict_function + + return self.predict_function + + @traceback_utils.filter_traceback + def predict( + self, + x, + batch_size=None, + verbose="auto", + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Generates output predictions for the input samples. + + Computation is done in batches. This method is designed for batch + processing of large numbers of inputs. It is not intended for use inside + of loops that iterate over your data and process small numbers of inputs + at a time. + + For small numbers of inputs that fit in one batch, + directly use `__call__()` for faster execution, e.g., + `model(x)`, or `model(x, training=False)` if you have layers such as + `tf.keras.layers.BatchNormalization` that behave differently during + inference. You may pair the individual model call with a `tf.function` + for additional performance inside your inner loop. + If you need access to numpy array values instead of tensors after your + model call, you can use `tensor.numpy()` to get the numpy array value of + an eager tensor. + + Also, note the fact that test loss is not affected by + regularization layers like noise and dropout. + + Note: See [this FAQ entry]( + https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call) + for more details about the difference between `Model` methods + `predict()` and `__call__()`. + + Args: + x: Input samples. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A `tf.data` dataset. + - A generator or `keras.utils.Sequence` instance. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + batch_size: Integer or `None`. + Number of samples per batch. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of dataset, generators, or `keras.utils.Sequence` instances + (since they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. + `"auto"` becomes 1 for most cases, and to 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so `verbose=2` is + recommended when not running interactively (e.g. in a production + environment). Defaults to 'auto'. 
+ steps: Total number of steps (batches of samples) + before declaring the prediction round finished. + Ignored with the default value of `None`. If x is a `tf.data` + dataset and `steps` is None, `predict()` will + run until the input dataset is exhausted. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during prediction. + See [callbacks]( + https://www.tensorflow.org/api_docs/python/tf/tf_keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the + generator queue. If unspecified, `max_queue_size` will default + to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default + to 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. Note that Model.predict uses the same interpretation rules + as `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for + all three methods. + + Returns: + Numpy array(s) of predictions. + + Raises: + RuntimeError: If `model.predict` is wrapped in a `tf.function`. + ValueError: In case of mismatch between the provided + input data and the model's expectations, + or in case a stateful model receives a number of samples + that is not a multiple of the batch size. + """ + version_utils.disallow_legacy_graph("Model", "predict") + self._check_call_args("predict") + training_module._disallow_inside_tf_function("predict") + + # TODO(yashkatariya): Cache model on the coordinator for faster + # prediction. If running under PSS, then swap it with OneDeviceStrategy + # so that execution will run on the coordinator. + original_pss_strategy = None + if self.distribute_strategy._should_use_with_coordinator: + original_pss_strategy = self.distribute_strategy + self._distribution_strategy = None + + # Cluster coordinator is set by `.fit()` and `.evaluate()` which is not + # needed in `.predict()` because all the predictions happen on the + # coordinator/locally. + if self._cluster_coordinator: + self._cluster_coordinator = None + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + outputs = None + with self.distribute_strategy.scope(): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + dataset_types = (tf.compat.v1.data.Dataset, tf.data.Dataset) + if (self._in_multi_worker_mode() or + training_module._is_tpu_multi_host(self.distribute_strategy)) and isinstance(x, dataset_types): + try: + options = tf.data.Options() + data_option = tf.data.experimental.AutoShardPolicy.DATA + options.experimental_distribute.auto_shard_policy = (data_option) + x = x.with_options(options) + except ValueError: + warnings.warn( + "Using Model.predict with MultiWorkerMirroredStrategy " + "or TPUStrategy and AutoShardPolicy.FILE might lead to " + "out-of-order result. 
Consider setting it to " + "AutoShardPolicy.DATA.", + stacklevel=2, + ) + + data_handler = data_adapter.get_data_handler( + x=x, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler.inferred_steps, + ) + + self.predict_function = self.make_predict_function() + self._predict_counter.assign(0) + callbacks.on_predict_begin() + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + batch_outputs = None + for _, iterator in data_handler.enumerate_epochs(): # Single epoch. + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + callbacks.on_predict_batch_begin(step) + tmp_batch_outputs = self.predict_function(iterator) + if data_handler.should_sync: + context.async_wait() + batch_outputs = ( + tmp_batch_outputs # No error, now safe to assign. + ) + if outputs is None: + outputs = tf.nest.map_structure( + lambda batch_output: [batch_output], + batch_outputs, + ) + else: + tf.__internal__.nest.map_structure_up_to( + batch_outputs, + lambda output, batch_output: output.append(batch_output), + outputs, + batch_outputs, + ) + end_step = step + data_handler.step_increment + callbacks.on_predict_batch_end(end_step, {"outputs": batch_outputs}) + if batch_outputs is None: + raise ValueError( + "Unexpected result of `predict_function` " + "(Empty batch_outputs). Please use " + "`Model.compile(..., run_eagerly=True)`, or " + "`tf.config.run_functions_eagerly(True)` for more " + "information of where went wrong, or file a " + "issue/bug to `tf.keras`." + ) + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_predict_end() + all_outputs = tf.__internal__.nest.map_structure_up_to( + batch_outputs, training_module.potentially_ragged_concat, outputs + ) + + # If originally PSS strategy was used, then replace it back since + # predict is running under `OneDeviceStrategy` after the swap and once + # its done we need to replace it back to PSS again. + if original_pss_strategy is not None: + self._distribution_strategy = original_pss_strategy + + return tf_utils.sync_to_numpy_or_python_type(all_outputs) + + def reset_metrics(self): + """Resets the state of all the metrics in the model. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> _ = model.fit(x, y, verbose=0) + >>> assert all(float(m.result()) for m in model.metrics) + + >>> model.reset_metrics() + >>> assert all(float(m.result()) == 0 for m in model.metrics) + + """ + for m in self.metrics: + m.reset_state() + + def train_on_batch( + self, + x, + y=None, + sample_weight=None, + class_weight=None, + reset_metrics=True, + return_dict=False, + ): + """Runs a single gradient update on a single batch of data. + + Args: + x: Input data. 
It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) to apply to the model's loss for the samples + from this class during training. This can be useful to tell the + model to "pay more attention" to samples from an under-represented + class. When `class_weight` is specified and targets have a rank of + 2 or greater, either `y` must be one-hot encoded, or an explicit + final dimension of `1` must be included for sparse class labels. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar training loss + (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.train_on_batch` is wrapped in a `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("train_on_batch") + training_module._disallow_inside_tf_function("train_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, y, sample_weight, class_weight) + self.train_function = self.make_train_function() + logs = self.train_function(iterator) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def test_on_batch( + self, + x, + y=None, + sample_weight=None, + reset_metrics=True, + return_dict=False, + ): + """Test the model on a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. 
+ reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.test_on_batch` is wrapped in a + `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("test_on_batch") + training_module._disallow_inside_tf_function("test_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, y, sample_weight) + self.test_function = self.make_test_function() + logs = self.test_function(iterator) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def predict_on_batch(self, x): + """Returns predictions for a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + + Returns: + Numpy array(s) of predictions. + + Raises: + RuntimeError: If `model.predict_on_batch` is wrapped in a + `tf.function`. + """ + self._check_call_args("predict_on_batch") + training_module._disallow_inside_tf_function("predict_on_batch") + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x) + self.predict_function = self.make_predict_function() + outputs = self.predict_function(iterator) + return tf_utils.sync_to_numpy_or_python_type(outputs) + + @doc_controls.do_not_generate_docs + def fit_generator( + self, + generator, + steps_per_epoch=None, + epochs=1, + verbose=1, + callbacks=None, + validation_data=None, + validation_steps=None, + validation_freq=1, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + shuffle=True, + initial_epoch=0, + ): + """Fits the model on data yielded batch-by-batch by a Python generator. + + DEPRECATED: + `Model.fit` now supports generators, so there is no longer any need to + use this endpoint. + """ + warnings.warn( + "`Model.fit_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.fit`, which supports generators.", + stacklevel=2, + ) + return self.fit( + generator, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + initial_epoch=initial_epoch, + ) + + @doc_controls.do_not_generate_docs + def evaluate_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Evaluates the model on a data generator. 
+ + DEPRECATED: + `Model.evaluate` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.evaluate_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.evaluate`, which supports generators.", + stacklevel=2, + ) + self._check_call_args("evaluate_generator") + + return self.evaluate( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + @doc_controls.do_not_generate_docs + def predict_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Generates predictions for the input samples from a data generator. + + DEPRECATED: + `Model.predict` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.predict_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.predict`, which supports generators.", + stacklevel=2, + ) + return self.predict( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + def _check_call_args(self, method_name): + """Check that `call()` has only one positional arg.""" + # Always allow first arg, regardless of arg name. + fullargspec = self.main_model._call_spec.full_argspec + if fullargspec.defaults: + positional_args = fullargspec.args[:-len(fullargspec.defaults)] + else: + positional_args = fullargspec.args + if "training" in positional_args: + positional_args.remove("training") + + # self and first arg can be positional. + if len(positional_args) > 2: + extra_args = positional_args[2:] + raise ValueError( + f"Models passed to `{method_name}` can only have `training` " + "and the first argument in `call()` as positional arguments, " + f"found: {extra_args}." + ) + + def _validate_compile(self, optimizer, metrics, **kwargs): + """Performs validation checks for the default `compile()`.""" + if any(isinstance(opt, optimizer_v1.Optimizer) for opt in tf.nest.flatten(optimizer)): + raise ValueError( + f"`tf.compat.v1.keras` Optimizer ({optimizer}) is " + "not supported when eager execution is enabled. Use a " + "`tf.keras` Optimizer instead, or disable eager " + "execution." + ) + + kwargs.pop("cloning", None) # Legacy DistStrat argument, never used. + kwargs.pop("experimental_run_tf_function", None) # Always `True`. + distribute_arg = kwargs.pop("distribute", None) + if distribute_arg is not None: + raise ValueError( + "`distribute` argument in compile is not available in TF 2.0. " + "Please create the model under the `strategy.scope()`. " + f"Received: {distribute_arg}." + ) + target_tensor_arg = kwargs.pop("target_tensors", None) + if target_tensor_arg is not None: + raise ValueError( + "`target_tensors` argument is not supported when executing " + f"eagerly. Received: {target_tensor_arg}." + ) + invalid_kwargs = set(kwargs) - {"sample_weight_mode"} + if invalid_kwargs: + raise TypeError( + "Invalid keyword argument(s) in `compile()`: " + f"{(invalid_kwargs,)}. Valid keyword arguments include " + '"cloning", "experimental_run_tf_function", "distribute",' + ' "target_tensors", or "sample_weight_mode".' + ) + + # Model must be created and compiled with the same DistStrat. 
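+ # Verify that every model variable was created inside the current strategy's scope; if not, `compile` raises and asks the user to rebuild the model under `strategy.scope()`.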
+ if tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + for v in self.main_model.variables: + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Variable ({v}) was not created in the distribution " + f"strategy scope of ({strategy}). It is most likely " + "because some layers, model, or optimizer was being " + "created outside the distribution strategy scope. Try " + "to make sure your code looks similar " + "to the following.\nwith strategy.scope():\n" + " model=_create_model()\n" + " model.compile(...)" + ) + + # Model metrics must be created in the same distribution strategy scope + # as the model. + strategy = self.distribute_strategy + for metric in tf.nest.flatten(metrics): + for v in getattr(metric, "variables", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Metric ({metric}) passed to `model.compile` was " + "created inside a different distribution strategy " + "scope than the model. All metrics must be created " + "in the same distribution strategy " + f"scope as the model (in this case {strategy}). " + "If you pass in a string identifier for a metric to " + "compile, the metric will automatically be created " + "in the correct distribution strategy scope." + ) + + # Model optimizers must be created in the same distribution strategy scope + # as the model. + for opt in tf.nest.flatten(optimizer): + for v in getattr(opt, "_weights", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Optimizer ({optimizer}) passed to `model.compile` " + "was created inside a different distribution strategy " + "scope than the model. All optimizers must be created " + "in the same distribution strategy scope as the model " + f"(in this case {strategy}). If you pass in a string " + "identifier for an optimizer to compile, the optimizer " + "will automatically be created in the correct " + "distribution strategy scope." + ) + + def _maybe_load_initial_counters_from_ckpt(self, steps_per_epoch, initial_epoch): + """Maybe load initial (epoch, step) counters from ckpt, considering worker recovery. + + Refer to tensorflow/python/tf_keras/distribute/worker_training_state.py + for more information. + + Args: + steps_per_epoch: The number of steps per epoch. + initial_epoch: The original initial_epoch the user passed in `fit()`. + + Returns: + If the training is recovering from a previous failure under a multi-worker + training setting, return the (epoch, step) the training is supposed to + continue at. Otherwise, return the `initial_epoch, initial_step` the + user passed in. + """ + initial_step = 0 + if self._training_state is not None: + return self._training_state.maybe_load_initial_counters_from_ckpt( + steps_per_epoch, initial_epoch, mode=ModeKeys.TRAIN + ) + return (initial_epoch, initial_step) + + def _assert_compile_was_called(self): + # Checks whether `compile` has been called. If it has been called, + # then the optimizer is set. This is different from whether the + # model is compiled + # (i.e. whether the model is built and its inputs/outputs are set). + if not self._is_compiled: + raise RuntimeError( + "You must compile your model before " + "training/testing. " + "Use `model.compile(optimizer, loss)`." + ) + + def _check_sample_weight_warning(self, x, sample_weight): + # Datasets can include sample weight, by returning a tuple with the + # structure of `(x, y, sample_weight)`.
+ sample_weight_present = sample_weight is not None or ( + isinstance(x, tf.data.Dataset) and isinstance(x.element_spec, tuple) and len(x.element_spec) == 3 + ) + + if (sample_weight_present and self.compiled_metrics._user_weighted_metrics is None): + logging.warning( + "`evaluate()` received a value for `sample_weight`, but " + "`weighted_metrics` were not provided. Did you mean to pass " + "metrics to `weighted_metrics` in `compile()`? If this is " + "intentional you can pass `weighted_metrics=[]` to `compile()` " + "in order to silence this warning." + ) + + def _should_eval(self, epoch, validation_freq): + epoch = epoch + 1 # one-index the user-facing epoch. + if isinstance(validation_freq, int): + return epoch % validation_freq == 0 + elif isinstance(validation_freq, list): + return epoch in validation_freq + else: + raise ValueError( + "Expected `validation_freq` to be a list or int. " + f"Received: validation_freq={validation_freq} of the " + f"type {type(validation_freq)}." + ) + + ###################################################################### + # Functions below exist only as v1 / v2 compatibility shims. + ###################################################################### + + def _get_compile_args(self, user_metrics=True): + """Used for saving or cloning a Model. + + Args: + user_metrics: Whether to return user-supplied metrics or `Metric` + objects. If True, returns the user-supplied metrics. + Defaults to `True`. + + Returns: + Dictionary of arguments that were used when compiling the model. + """ + self._assert_compile_was_called() + saved_metrics = self.compiled_metrics._user_metrics + saved_weighted_metrics = self.compiled_metrics._user_weighted_metrics + + if not user_metrics: + if saved_metrics is not None: + saved_metrics = self.compiled_metrics._metrics + if saved_weighted_metrics is not None: + saved_weighted_metrics = self.compiled_metrics._weighted_metrics + + compile_args = { + "optimizer": self.optimizer, + "loss": self.compiled_loss._user_losses, + "metrics": saved_metrics, + "weighted_metrics": saved_weighted_metrics, + "loss_weights": self.compiled_loss._user_loss_weights, + } + return compile_args + + def _get_callback_model(self): + return self + + def _in_multi_worker_mode(self): + return self.distribute_strategy.extended._in_multi_worker_mode() + + @property + def _compile_was_called(self): + return self._is_compiled + + @property + def main_model(self): + """ + Returns: + The main model. + """ + if len(self._model) == 1: + return self._model["main"] + else: + for name, _model in self._model.items(): + if "main" in name: + return _model + raise ValueError("Could not find the main model.") + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _maybe_create_attribute(self, name, default_value): + """Create attribute (with the default value) if it hasn't been created. + + This is useful for fields that are used for tracking purposes, such as + _trainable_weights or _layers. Note that a user could create a layer + subclass and assign an internal field before invoking Layer.__init__(); + __setattr__() needs to create the tracking fields and __init__() must + not override them. + + Args: + name: String, the name of the attribute. + default_value: Object, the default value of the attribute. + """ + if not hasattr(self, name): + self.__setattr__(name, default_value) + + def _get_trainable_state(self): + """Get the `trainable` state of each sublayer. + + Returns: + A dict mapping all sublayers to their `trainable` value.
+ """ + trainable_state = weakref.WeakKeyDictionary() + for layer in self.main_model._flatten_layers(): + trainable_state[layer] = layer.trainable + return trainable_state diff --git a/deepray/core/utils/misc/distribution_utils.py b/deepray/core/utils/misc/distribution_utils.py index 9a9d072f..0042d469 100644 --- a/deepray/core/utils/misc/distribution_utils.py +++ b/deepray/core/utils/misc/distribution_utils.py @@ -284,40 +284,6 @@ def undo_set_up_synthetic_data(): _undo_monkey_patch_dataset_method(tf.distribute.experimental.MultiWorkerMirroredStrategy) -def configure_cluster(worker_hosts=None, task_index=-1): - """Set multi-worker cluster spec in TF_CONFIG environment variable. - - Args: - worker_hosts: comma-separated list of worker ip:port pairs. - - Returns: - Number of workers in the cluster. - """ - tf_config = json.loads(os.environ.get('TF_CONFIG', '{}')) - if tf_config: - num_workers = (len(tf_config['cluster'].get('chief', [])) + len(tf_config['cluster'].get('worker', []))) - elif worker_hosts: - workers = worker_hosts.split(',') - num_workers = len(workers) - if num_workers > 1 and task_index < 0: - raise ValueError('Must specify task_index when number of workers > 1') - task_index = 0 if num_workers == 1 else task_index - os.environ['TF_CONFIG'] = json.dumps( - { - 'cluster': { - 'worker': workers - }, - 'task': { - 'type': 'worker', - 'index': task_index - } - } - ) - else: - num_workers = 1 - return num_workers - - def get_strategy_scope(strategy): if strategy: strategy_scope = strategy.scope() diff --git a/deepray/core/utils/misc/keras_utils.py b/deepray/core/utils/misc/keras_utils.py deleted file mode 100644 index a4c24e97..00000000 --- a/deepray/core/utils/misc/keras_utils.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Helper functions for the Keras implementations of models.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import multiprocessing -import os -import time - -from absl import logging -import tensorflow as tf -from tensorflow.core.protobuf import rewriter_config_pb2 -from tensorflow.python import tf2 -from tensorflow.python.eager import profiler - - -class BatchTimestamp(object): - """A structure to store batch time stamp.""" - - def __init__(self, batch_index, timestamp): - self.batch_index = batch_index - self.timestamp = timestamp - - def __repr__(self): - return "'BatchTimestamp'".format(self.batch_index, self.timestamp) - - -class TimeHistory(tf.keras.callbacks.Callback): - """Callback for Keras models.""" - - def __init__(self, batch_size, log_steps): - """Callback for logging performance. - - Args: - batch_size: Total batch size. - log_steps: Interval of steps between logging of batch level stats. 
- """ - self.batch_size = batch_size - super(TimeHistory, self).__init__() - self.log_steps = log_steps - self.global_steps = 0 - - # Logs start of step 1 then end of each step based on log_steps interval. - self.timestamp_log = [] - - # Records the time each epoch takes to run from start to finish of epoch. - self.epoch_runtime_log = [] - - def on_train_end(self, logs=None): - self.train_finish_time = time.time() - - def on_epoch_begin(self, epoch, logs=None): - self.epoch_start = time.time() - - def on_batch_begin(self, batch, logs=None): - self.global_steps += 1 - if self.global_steps == 1: - self.start_time = time.time() - self.timestamp_log.append(BatchTimestamp(self.global_steps, self.start_time)) - - def on_batch_end(self, batch, logs=None): - """Records elapse time of the batch and calculates examples per second.""" - if self.global_steps % self.log_steps == 0: - timestamp = time.time() - elapsed_time = timestamp - self.start_time - examples_per_second = (self.batch_size * self.log_steps) / elapsed_time - self.timestamp_log.append(BatchTimestamp(self.global_steps, timestamp)) - logging.info( - "BenchmarkMetric: {'global step':%d, 'time_taken': %f," - "'examples_per_second': %f}", self.global_steps, elapsed_time, examples_per_second - ) - self.start_time = timestamp - - def on_epoch_end(self, epoch, logs=None): - epoch_run_time = time.time() - self.epoch_start - self.epoch_runtime_log.append(epoch_run_time) - logging.info("BenchmarkMetric: {'epoch':%d, 'time_taken': %f}", epoch, epoch_run_time) - - -def get_profiler_callback(model_dir, profile_steps, enable_tensorboard, steps_per_epoch): - """Validate profile_steps flag value and return profiler callback.""" - profile_steps_error_message = ( - 'profile_steps must be a comma separated pair of positive integers, ' - 'specifying the first and last steps to be profiled.' - ) - try: - profile_steps = [int(i) for i in profile_steps.split(',')] - except ValueError: - raise ValueError(profile_steps_error_message) - if len(profile_steps) != 2: - raise ValueError(profile_steps_error_message) - start_step, stop_step = profile_steps - if start_step < 0 or start_step > stop_step: - raise ValueError(profile_steps_error_message) - if enable_tensorboard: - logging.warning( - 'Both TensorBoard and profiler callbacks are used. Note that the ' - 'TensorBoard callback profiles the 2nd step (unless otherwise ' - 'specified). Please make sure the steps profiled by the two callbacks ' - 'do not overlap.' 
- ) - return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch) - - -class ProfilerCallback(tf.keras.callbacks.Callback): - """Save profiles in specified step range to log directory.""" - - def __init__(self, log_dir, start_step, stop_step, steps_per_epoch): - super(ProfilerCallback, self).__init__() - self.log_dir = log_dir - self.start_step = start_step - self.stop_step = stop_step - self.start_epoch = start_step // steps_per_epoch - self.stop_epoch = stop_step // steps_per_epoch - self.start_step_in_epoch = start_step % steps_per_epoch - self.stop_step_in_epoch = stop_step % steps_per_epoch - self.should_start = False - self.should_stop = False - - def on_epoch_begin(self, epoch, logs=None): - if epoch == self.start_epoch: - self.should_start = True - if epoch == self.stop_epoch: - self.should_stop = True - - def on_batch_begin(self, batch, logs=None): - if batch == self.start_step_in_epoch and self.should_start: - self.should_start = False - profiler.start() - logging.info('Profiler started at Step %s', self.start_step) - - def on_batch_end(self, batch, logs=None): - if batch == self.stop_step_in_epoch and self.should_stop: - self.should_stop = False - results = profiler.stop() - profiler.save(self.log_dir, results) - logging.info( - 'Profiler saved profiles for steps between %s and %s to %s', self.start_step, self.stop_step, self.log_dir - ) - - -def set_session_config(enable_eager=False, enable_xla=False): - """Sets the session config.""" - if is_v2_0(): - set_config_v2(enable_xla=enable_xla) - else: - config = get_config_proto_v1(enable_xla=enable_xla) - if enable_eager: - tf.compat.v1.enable_eager_execution(config=config) - else: - sess = tf.Session(config=config) - tf.keras.backend.set_session(sess) - - -def get_config_proto_v1(enable_xla=False): - """Return config proto according to flag settings, or None to use default.""" - config = None - if enable_xla: - config = tf.compat.v1.ConfigProto() - config.graph_options.optimizer_options.global_jit_level = (tf.OptimizerOptions.ON_2) - return config - - -def set_config_v2(enable_xla=False): - """Config eager context according to flag values using TF 2.0 API.""" - if enable_xla: - tf.config.optimizer.set_jit(True) - - -def is_v2_0(): - """Returns true if using tf 2.0.""" - return tf2.enabled() - - -def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): - """Set GPU thread mode and count, and adjust dataset threads count.""" - cpu_count = multiprocessing.cpu_count() - logging.info('Logical CPU cores: %s', cpu_count) - - # Allocate private thread pool for each GPU to schedule and launch kernels - per_gpu_thread_count = per_gpu_thread_count or 2 - os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode - os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) - logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) - logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) - - # Limit data preprocessing threadpool to CPU cores minus number of total GPU - # private threads and memory copy threads. 
- total_gpu_thread_count = per_gpu_thread_count * num_gpus - num_runtime_threads = num_gpus - if not datasets_num_private_threads: - datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) - logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads) diff --git a/deepray/custom_ops/BUILD b/deepray/custom_ops/BUILD index 4b0226ff..99fabfd4 100644 --- a/deepray/custom_ops/BUILD +++ b/deepray/custom_ops/BUILD @@ -5,12 +5,15 @@ py_library( srcs = glob(["**/*.py"]), deps = [ "//deepray/custom_ops/correlation_cost", + "//deepray/custom_ops/embedding_bag", + "//deepray/custom_ops/embedding_variable", "//deepray/custom_ops/ffm_ops", - "//deepray/custom_ops/multiplex_1:multiplex_1_op", + "//deepray/custom_ops/multiplex_1", "//deepray/custom_ops/multiplex_2:multiplex_2_op", "//deepray/custom_ops/multiplex_3:multiplex_3_op", "//deepray/custom_ops/multiplex_4:multiplex_4_op", "//deepray/custom_ops/parquet_dataset", + "//deepray/custom_ops/seq2seq", "//deepray/custom_ops/simple_hash_table", "//deepray/custom_ops/sleep:sleep_op", "//deepray/custom_ops/training_ops", diff --git a/deepray/custom_ops/correlation_cost/BUILD b/deepray/custom_ops/correlation_cost/BUILD index 0a9c71a9..9927511c 100644 --- a/deepray/custom_ops/correlation_cost/BUILD +++ b/deepray/custom_ops/correlation_cost/BUILD @@ -1,19 +1,24 @@ +load("@rules_python//python:defs.bzl", "py_test") load("//deepray:deepray.bzl", "custom_op_library") licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) +CORRELATION_COST_OP_SRCS = [ + "cc/kernels/correlation_cost_op.cc", + "cc/ops/correlation_cost_op.cc", +] + custom_op_library( name = "_correlation_cost_ops.so", - srcs = [ - "cc/kernels/correlation_cost_op.cc", - "cc/kernels/correlation_cost_op.h", - "cc/ops/correlation_cost_op.cc", + srcs = CORRELATION_COST_OP_SRCS + ["cc/kernels/correlation_cost_op.h"], + gpu_deps = [ + "@cub_archive//:cub", ], - cuda_srcs = [ - "cc/kernels/correlation_cost_op.h", + gpu_srcs = [ "cc/kernels/correlation_cost_op_gpu.cu.cc", + "cc/kernels/correlation_cost_op.h", ], ) @@ -25,12 +30,7 @@ py_library( "*.py", ], ), - data = [ - ":_correlation_cost_ops.so", - ], - deps = [ - "//deepray/utils", - ], + data = [":_correlation_cost_ops.so"], ) py_test( @@ -38,7 +38,12 @@ py_test( size = "small", srcs = glob(["python/tests/*"]), main = "python/tests/run_all_test.py", + python_version = "PY3", deps = [ ":correlation_cost", + "//deepray/utils", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", + "@pypi_typeguard//:pkg", ], ) diff --git a/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc b/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc index 9496d47c..9978bcdb 100644 --- a/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc +++ b/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc @@ -17,8 +17,9 @@ limitations under the License. 
#define EIGEN_USE_GPU +#include + #include "correlation_cost_op.h" -#include "cub/device/device_reduce.cuh" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/util/gpu_kernel_helper.h" diff --git a/deepray/custom_ops/correlation_cost/python/optical_flow.py b/deepray/custom_ops/correlation_cost/python/optical_flow.py index d7565e5d..38742732 100644 --- a/deepray/custom_ops/correlation_cost/python/optical_flow.py +++ b/deepray/custom_ops/correlation_cost/python/optical_flow.py @@ -18,7 +18,7 @@ from typeguard import typechecked from deepray.utils.resource_loader import LazySO -_correlation_cost_so = LazySO("custom_ops/correlation_cost/_correlation_cost_ops.so") +gen_correlation_cost_ops = LazySO("custom_ops/correlation_cost/_correlation_cost_ops.so") def _correlation_cost( @@ -76,7 +76,7 @@ def _correlation_cost( """ with tf.name_scope(name or "correlation_cost"): - op_call = _correlation_cost_so.ops.deepray_correlation_cost + op_call = gen_correlation_cost_ops.ops.deepray_correlation_cost if data_format == "channels_last": op_data_format = "NHWC" @@ -116,7 +116,7 @@ def _correlation_cost_grad(op, grad_output): input_b = tf.convert_to_tensor(op.inputs[1], name="input_b") grad_output_tensor = tf.convert_to_tensor(grad_output, name="grad_output") - op_call = _correlation_cost_so.ops.deepray_correlation_cost_grad + op_call = gen_correlation_cost_ops.ops.deepray_correlation_cost_grad grads = op_call( input_a, input_b, diff --git a/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py b/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py index d5c4af3d..8261049e 100644 --- a/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py +++ b/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py @@ -1,8 +1,7 @@ from pathlib import Path import sys - import pytest if __name__ == "__main__": dirname = Path(__file__).absolute().parent - sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/distributed_embeddings/BUILD b/deepray/custom_ops/distributed_embeddings/BUILD index acba7f11..33e86195 100644 --- a/deepray/custom_ops/distributed_embeddings/BUILD +++ b/deepray/custom_ops/distributed_embeddings/BUILD @@ -11,16 +11,25 @@ custom_op_library( "cc/kernels/embedding_lookup_kernels.cc", "cc/ops/embedding_lookup_ops.cc", ], - cuda_srcs = [ + gpu_deps = [ + "@cub_archive//:cub", + "@com_github_NVIDIA_cuCollections//:cuCollections", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cuda_runtime", + "@local_config_cuda//cuda:cudart", + ], + gpu_srcs = [ + # TODO: Update cuCollections version + "cc/kernels/embedding_lookup.h", "cc/kernels/embedding_lookup_kernels.cu.cc", ], deps = [ - "@cuCollections//:cuco_hash_table", + "//deepray/custom_ops/utils:ok_status_util", ], ) py_library( - name = "distributed_embeddings_ops", + name = "distributed_embeddings", srcs = glob( [ "python/*.py", @@ -42,6 +51,6 @@ py_test( srcs = glob(["python/tests/*"]), main = "python/tests/run_all_test.py", deps = [ - ":distributed_embeddings_ops", + ":distributed_embeddings", ], ) diff --git a/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc b/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc index eca712d0..09638d40 100644 --- a/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc +++ 
b/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc @@ -21,8 +21,9 @@ #include -#include "cub/cub.cuh" -#include "cuco/static_map.cuh" +#include +#include + #include "embedding_lookup.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/util/gpu_kernel_helper.h" diff --git a/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc b/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc index feb00d43..7d498505 100644 --- a/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc +++ b/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc @@ -15,6 +15,7 @@ * limitations under the License. */ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -30,7 +31,7 @@ REGISTER_OP("ReadVariableNoCopy") TF_RETURN_IF_ERROR( shape_inference::ValidateVariableResourceHandle(c, &shape_and_type)); c->set_output(0, shape_and_type[0].shape); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("RowToSplit") @@ -40,7 +41,7 @@ REGISTER_OP("RowToSplit") .Output("row_split: Tindices") .SetShapeFn([](shape_inference::InferenceContext* c) { // TODO - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("EmbeddingLookupVariableHotness") @@ -66,7 +67,7 @@ REGISTER_OP("EmbeddingLookupVariableHotness") outdim_0 -= 1; } c->set_output(0, c->Matrix(outdim_0, c->Dim(params_shape, 1))); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("EmbeddingLookupVariableHotnessGrad") @@ -86,7 +87,7 @@ REGISTER_OP("EmbeddingLookupVariableHotnessGrad") c->Vector(shape_inference::InferenceContext::kUnknownDim)); c->set_output(1, c->Matrix(shape_inference::InferenceContext::kUnknownDim, c->Dim(grad_shape, 1))); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("IntegerLookup") @@ -99,7 +100,7 @@ REGISTER_OP("IntegerLookup") .Output("values: T") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(2)); - return Status::OK(); + return TFOkStatus; }); } // namespace tensorflow diff --git a/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py b/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py index 7a094719..e98161d9 100644 --- a/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py +++ b/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py @@ -31,8 +31,6 @@ flags.DEFINE_bool("graph_mode", default=False, help="Run in graph mode.") flags.DEFINE_string("mixed_precision_policy", default=None, help="Mixed precision policy to be set.") -FLAGS = flags.FLAGS - large_testcase_sizes = [ [2, 8], [2, 16], [10, 8], [10, 16], [10, 16], [10, 16], [10, 16], [10, 16], [10, 32], [10, 128], [10, 128], [10, 128], [10, 128], [10, 1024], [100, 16], [100, 32], [100, 32], [100, 32], [100, 32], [100, 128], diff --git a/deepray/custom_ops/embedding_bag/BUILD b/deepray/custom_ops/embedding_bag/BUILD new file mode 100644 index 00000000..89e3236a --- /dev/null +++ b/deepray/custom_ops/embedding_bag/BUILD @@ -0,0 +1,49 @@ +load("@rules_python//python:defs.bzl", "py_test") +load("//deepray:deepray.bzl", "custom_op_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +custom_op_library( + name = "_embedding_bag_ops.so", + srcs = [ + "cc/kernels/embedding_bag_ops.cc", 
+ "cc/kernels/embedding_bag_ops.h", + "cc/ops/embedding_bag_ops.cc", + ], + gpu_deps = [ + "@local_config_cuda//cuda:cuda_runtime", + ], + gpu_srcs = [ + "cc/kernels/embedding_bag_ops.h", + "cc/kernels/embedding_bag_ops_gpu.cu.cc", + "cc/kernels/embedding_bag_backward_kernels.cu.cc", + ], +) + +py_library( + name = "embedding_bag", + srcs = glob( + [ + "python/*.py", + "*.py", + ], + ), + data = [":_embedding_bag_ops.so"], +) + +py_test( + name = "embedding_bag_test", + size = "small", + srcs = glob(["python/tests/*"]), + main = "python/tests/run_all_test.py", + python_version = "PY3", + deps = [ + ":embedding_bag", + "//deepray/utils", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", + "@pypi_typeguard//:pkg", + ], +) diff --git a/deepray/custom_ops/embedding_bag/__init__.py b/deepray/custom_ops/embedding_bag/__init__.py new file mode 100644 index 00000000..7f50af3e --- /dev/null +++ b/deepray/custom_ops/embedding_bag/__init__.py @@ -0,0 +1 @@ +from .python.embedding_bag import EmbeddingBag, _embedding_bag diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc new file mode 100644 index 00000000..b6cdce68 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc @@ -0,0 +1,247 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include +#include +#include + +#include "embedding_bag_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +constexpr int MAX_THREADS_PER_BLOCK = 1024; + +namespace tensorflow { +namespace addons { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +__global__ void PrepTempArraysKernel( + const Tindices *__restrict__ indices, Tindices *__restrict__ sortedIndices, + Tindices *__restrict__ sortedIndicesCounter, const int indices_size) { + const int arrayIdx = (blockIdx.x * kThreadsPerBlock) + threadIdx.x; + if (arrayIdx < + indices_size) { // Make sure we don't run off the end of the actual array + sortedIndices[arrayIdx] = indices[arrayIdx]; + sortedIndicesCounter[arrayIdx] = arrayIdx; + } +} + +// Define the CUDA kernel. 
+template +__global__ void EmbeddingBagWeightsGradKernel( + const int value_dim, const Tindices *__restrict__ indices, + const T *__restrict__ values, const T *__restrict__ dloss, + T *__restrict__ weights_grad, Combiner combiner) { + const int sample_idx = blockIdx.x; + const int bag_idx = blockIdx.y; + const int bag_dim = gridDim.y; + const int valueBaseIdx = + indices[(sample_idx * bag_dim) + bag_idx] * value_dim; + const int dlossBaseIdx = sample_idx * value_dim; + // Use a full-precision accumulator even for half-precision inputs + float partialDotProduct = 0.0f; + for (int i = threadIdx.x; i < value_dim; + i += blockDim.x) // Note that some threads may stop one iteration + // earlier if the block straddles the end of the array + { + partialDotProduct += + static_cast(values[valueBaseIdx + i] * dloss[dlossBaseIdx + i]); + } + unsigned activeMask = 0xffffffff; +#pragma unroll + for (int offset = kThreadsPerBlock / 2; offset > 0; offset /= 2) { + partialDotProduct += + __shfl_down_sync(activeMask, partialDotProduct, offset); + } + if (combiner == Combiner::kMean) { + partialDotProduct /= static_cast(bag_dim); + } + // Thread 0 now has the full dot product + if (threadIdx.x == 0) { + weights_grad[(sample_idx * bag_dim) + bag_idx] = + static_cast(partialDotProduct); + } +} + +template +__global__ void EmbeddingBagValuesGradKernel( + const int value_dim, const int bag_dim, + const Tindices *__restrict__ sortedIndices, + const Tindices *__restrict__ counter, const T *__restrict__ values, + const T *__restrict__ weights, const T *__restrict__ dloss, + T *__restrict__ values_grad, Combiner combiner) { + const int startIdx = blockIdx.x; + const int chunk = blockIdx.y; + const int kThreadsPerBlock = blockDim.x; + const int featureIdx = threadIdx.x + (chunk * kThreadsPerBlock); + // The core problem here is that we want to avoid parallel writes to the + // same element of the grads. We avoid that by pre-sorting a copy of the + // indices tensor, and also co-sorting a 'counter' array so that we still know + // which element of the incoming gradient tensor corresponds to each. Then, we + // take the slightly lazy approach of spinning up a warp for each element of + // the indices array, but having each warp check the previous element before + // it starts. If the two elements are the same, then the warp immediately + // returns without doing anything. If not, then the warp iterates forward and + // accumulates gradient until it hits a different index element, at which + // point it writes the accumulated value and returns. This ensures that each + // row of the values grad tensor is handled by one and exactly one warp. 
+ const int valuesIdx = ldg(sortedIndices + startIdx); + if (startIdx > 0) { + const int prevIdx = ldg(sortedIndices + startIdx - 1); + if (prevIdx == valuesIdx) { + return; // Another block is handling this index, exit + } + } + int endIdx = startIdx; + while (endIdx < gridDim.x - 1) // Don't run off the end of the array + { + int nextIdx = endIdx + 1; + int nextValuesIdx = ldg(sortedIndices + nextIdx); + if (nextValuesIdx == valuesIdx) { + endIdx += 1; + } else { + break; + } + } + if (featureIdx < value_dim) // Don't run off the end of the row + { + const int outputOffset = (valuesIdx * value_dim) + featureIdx; + float accum = 0.0f; // Full precision even if the inputs aren't + + for (int currentIdx = startIdx; currentIdx <= endIdx; ++currentIdx) { + int originalIdxPosition = ldg(counter + currentIdx); + T weight = weights[originalIdxPosition]; + // The floor division on this line is correct and intentional + T featureDloss = + ldg(dloss + (originalIdxPosition / bag_dim) + featureIdx); + accum += static_cast(weight * featureDloss); + } + if (combiner == Combiner::kMean) { + accum /= static_cast(bag_dim); + } + values_grad[outputOffset] = static_cast(accum); + } +} + +// Define the GPU implementation that launches the CUDA kernel. +template +struct EmbeddingBagBackwardFunctor { + // indices should remain unchanged, but thrust complains if it's a const + // pointer + void operator()(const GPUDevice &d, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context) { + // I copy-pasted this bit from histogram_op_gpu.cu.cc and I sure hope it + // works + tensorflow::AllocatorAttributes gpu_allocator; + gpu_allocator.set_on_host(false); + gpu_allocator.set_gpu_compatible(true); + + Tensor sortedIndicesTensor; + Tensor sortedIndicesCounterTensor; + + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({indices.size()}), + &sortedIndicesTensor, gpu_allocator)); + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + TensorShape({indices.size()}), + &sortedIndicesCounterTensor, gpu_allocator)); + auto sortedIndices = sortedIndicesTensor.flat(); + auto sortedIndicesCounter = sortedIndicesCounterTensor.flat(); + // Note: I tried splitting the two kernels into different streams but + // performance was barely affected. 
+ const Eigen::Index batch_dim = indices.dimension(0); + const Eigen::Index bag_dim = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + const auto params_size = params.size(); + const int kThreadsPerBlock = 32; + dim3 gridShape = dim3(batch_dim, bag_dim, 1); + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagWeightsGradKernel, gridShape, + kThreadsPerBlock, 0, d.stream(), output_dim, indices.data(), + params.data(), grads.data(), weights_grads.data(), combiner)); + + const int indices_size = indices.size(); + const int values_size = params.size(); + const int total_blocks = Eigen::divup(indices_size, kThreadsPerBlock); + gridShape = dim3(total_blocks, 1, 1); + + TF_CHECK_OK(GpuLaunchKernel( + PrepTempArraysKernel, gridShape, + kThreadsPerBlock, 0, d.stream(), indices.data(), sortedIndices.data(), + sortedIndicesCounter.data(), indices_size)); + + thrust::device_ptr sortedIndicesCounterDevicePtr( + sortedIndicesCounter.data()); + thrust::device_ptr sortedIndicesDevicePtr(sortedIndices.data()); + thrust::device_ptr paramsGradDevicePtr(params_grads.data()); + thrust::fill(paramsGradDevicePtr, + paramsGradDevicePtr + static_cast(params_size), + static_cast(0.0f)); + thrust::sort_by_key(sortedIndicesDevicePtr, + sortedIndicesDevicePtr + indices_size, + sortedIndicesCounterDevicePtr); + // Handle each row with as few thread blocks as possible + int threadsPerBlock; + int blocksPerRow; + if (output_dim <= MAX_THREADS_PER_BLOCK) { + blocksPerRow = 1; + threadsPerBlock = output_dim; + } else { + blocksPerRow = + Eigen::divup(static_cast(output_dim), MAX_THREADS_PER_BLOCK); + threadsPerBlock = + Eigen::divup(static_cast(output_dim), blocksPerRow); + } + // int blocksPerRow = 1; + // while (threadsPerBlock > MAX_THREADS_PER_BLOCK) { + // threadsPerBlock = (threadsPerBlock + 1) / 2; // Ceiling division + // blocksPerRow *= 2; + // } + gridShape = dim3(indices_size, blocksPerRow, 1); + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagValuesGradKernel, gridShape, threadsPerBlock, + 0, d.stream(), output_dim, bag_dim, sortedIndices.data(), + sortedIndicesCounter.data(), params.data(), weights.data(), + grads.data(), params_grads.data(), combiner)); + } +}; + +// Explicitly instantiate functors for the types of OpKernels registered. +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc new file mode 100644 index 00000000..fd6169d1 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc @@ -0,0 +1,330 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "embedding_bag_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace addons { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +// CPU specialization of actual computation. +template +struct EmbeddingBagFunctor { + static constexpr int64 kPacketSize = Eigen::internal::packet_traits::size; + using VectorMap = Eigen::Map>; + using ConstVectorMap = Eigen::Map>; + + void operator()(const CPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner) { + const Eigen::Index bags = indices.dimension(0); + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + const auto work = [&](Eigen::Index start, Eigen::Index end) { + for (Eigen::Index bag = start; bag < end; ++bag) { + VectorMap output_slice(&output(bag, 0), output_dim); + output_slice.setZero(); + for (Eigen::Index seq = 0; seq < sequence_length; ++seq) { + const ConstVectorMap params_slice(¶ms(indices(bag, seq), 0), + output_dim); + output_slice += params_slice * weights(bag, seq); + } + if (combiner == Combiner::kMean) { + output_slice /= static_cast(sequence_length); + } + } + }; + + const double bytes_loaded = + sequence_length * (sizeof(Tindices) + sizeof(T)) + + (sequence_length * output_dim) * sizeof(T); + const double bytes_stored = output_dim * sizeof(T); + const double compute_cycles = + (sequence_length * output_dim) * + (Eigen::TensorOpCost::AddCost() + Eigen::TensorOpCost::MulCost()); + const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, compute_cycles, + /*vectorized=*/true, + /*packet_size=*/kPacketSize); + device.parallelFor(bags, cost, std::move(work)); + } +}; + +// CPU specialization of actual computation. +template +struct EmbeddingBagBackwardFunctor { + static constexpr int64 kPacketSize = Eigen::internal::packet_traits::size; + using VectorMap = Eigen::Map>; + using ConstVectorMap = Eigen::Map>; + + void operator()(const CPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context) { + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + std::unordered_map index_map; + // The pair (x, {y_i}) in index_vec means + // index y_i in `indices` contributes to bag `x`. 
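+    // Here x is a row of `params` and each y_i is a flattened position in
+    // `indices` holding that row. For example, indices = [[3, 1], [3, 2]]
+    // (flattened to [3, 1, 3, 2]) gives index_vec = (3, {0, 2}), (1, {1}),
+    // (2, {3}), so row 3 of params_grads accumulates the gradients flowing
+    // through flattened positions 0 and 2.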
+ std::vector>> index_vec; + for (Eigen::Index i = 0; i < indices.size(); ++i) { + Tindices index = indices.data()[i]; + if (index_map.find(index) == index_map.end()) { + index_map[index] = index_vec.size(); + index_vec.push_back({index, {}}); + } + index_vec[index_map[index]].second.push_back(i); + } + + const auto compute_params_grads = [&](Eigen::Index start, + Eigen::Index end) { + for (Eigen::Index i = start; i < end; ++i) { + VectorMap params_grads_slice(¶ms_grads(index_vec[i].first, 0), + output_dim); + for (Eigen::Index index : index_vec[i].second) { + const Eigen::Index bag = index / sequence_length; + const Eigen::Index seq = index % sequence_length; + const ConstVectorMap grads_slice(&grads(bag, 0), output_dim); + params_grads_slice += grads_slice * weights(bag, seq); + } + if (combiner == Combiner::kMean) { + params_grads_slice /= static_cast(sequence_length); + } + } + }; + + const Eigen::Index num_unique_params = index_vec.size(); + const double bytes_loaded = 100 * output_dim * sizeof(T); + const double bytes_stored = output_dim * sizeof(T); + const double compute_cycles = + 100 * output_dim * + (Eigen::TensorOpCost::AddCost() + Eigen::TensorOpCost::MulCost()); + const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, compute_cycles, + /*vectorized=*/true, + /*packet_size=*/kPacketSize); + params_grads.setZero(); + device.parallelFor(num_unique_params, cost, + std::move(compute_params_grads)); + + const auto compute_weights_grads = + [&](const Eigen::array &coords) -> T { + const Eigen::Index bag = coords[0]; + const Eigen::Index seq = coords[1]; + const ConstVectorMap grads_slice(&grads(bag, 0), output_dim); + const ConstVectorMap params_slice(¶ms(indices(bag, seq), 0), + output_dim); + T output = params_slice.dot(grads_slice); + if (combiner == Combiner::kMean) { + output /= static_cast(sequence_length); + } + return output; + }; + + weights_grads.device(device) = + weights_grads.generate(std::move(compute_weights_grads)); + } +}; +} // namespace functor + +namespace { +bool ValidateCombiner(const std::string &combiner_string, Combiner *combiner) { + if (combiner_string == "SUM") { + *combiner = Combiner::kSum; + } else if (combiner_string == "MEAN") { + *combiner = Combiner::kMean; + } else { + return false; + } + return true; +} +} // namespace + +template +class EmbeddingBagOp : public OpKernel { + public: + explicit EmbeddingBagOp(OpKernelConstruction *context) : OpKernel(context) { + std::string combiner_string; + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_string)); + OP_REQUIRES( + context, ValidateCombiner(combiner_string, &combiner_), + errors::InvalidArgument("Only support 'SUM' and 'MEAN' combiner.")); + } + + void Compute(OpKernelContext *context) override { + const Tensor &indices = context->input(0); + const Tensor ¶ms = context->input(1); + const Tensor &weights = context->input(2); + + const TensorShape &indices_shape = indices.shape(); + const TensorShape ¶ms_shape = params.shape(); + const TensorShape &weights_shape = weights.shape(); + + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(indices_shape), + errors::InvalidArgument("indices shape should be 2-D.")); + OP_REQUIRES(context, indices_shape == weights_shape, + errors::InvalidArgument( + "Shape of indices and weights should be equal.")); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(params_shape), + errors::InvalidArgument("params shape should be 2-D.")); + + TensorShape output_shape = {indices_shape.dim_size(0), + params_shape.dim_size(1)}; + + Tensor *output = nullptr; + 
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + functor::EmbeddingBagFunctor()( + context->eigen_device(), indices.tensor(), + params.tensor(), weights.tensor(), output->tensor(), + combiner_); + } + + private: + Combiner combiner_; +}; + +template +class EmbeddingBagBackwardOp : public OpKernel { + public: + explicit EmbeddingBagBackwardOp(OpKernelConstruction *context) + : OpKernel(context) { + std::string combiner_string; + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_string)); + OP_REQUIRES( + context, ValidateCombiner(combiner_string, &combiner_), + errors::InvalidArgument("Only support 'SUM' and 'MEAN' combiner.")); + } + + void Compute(OpKernelContext *context) override { + const Tensor &indices = context->input(0); + const Tensor ¶ms = context->input(1); + const Tensor &weights = context->input(2); + const Tensor &grads = context->input(3); + + Tensor *params_grads = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, params.shape(), ¶ms_grads)); + Tensor *weights_grads = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(1, weights.shape(), &weights_grads)); + functor::EmbeddingBagBackwardFunctor()( + context->eigen_device(), indices.tensor(), + params.tensor(), weights.tensor(), grads.tensor(), + params_grads->tensor(), weights_grads->tensor(), combiner_, + context); // Pass the context so the GPU op can allocate the temporary + // arrays it needs + } + + private: + Combiner combiner_; +}; + +// Register the CPU kernels. +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); +REGISTER_CPU_KERNEL(Eigen::half); +REGISTER_CPU_KERNEL(float); +REGISTER_CPU_KERNEL(double); +#undef REGISTER_CPU_KERNEL + +#if GOOGLE_CUDA +namespace functor { +// Forward declarations of the functor specializations for GPU. +#define DECLARE_GPU_SPEC(T, Tindices) \ + template <> \ + void EmbeddingBagFunctor::operator()( \ + const GPUDevice &, typename TTypes::ConstTensor, \ + typename TTypes::ConstTensor, typename TTypes::ConstTensor, \ + typename TTypes::Tensor, Combiner); \ + extern template struct EmbeddingBagFunctor; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, int32); \ + DECLARE_GPU_SPEC(T, int64); + +DECLARE_GPU_SPECS(Eigen::half); +DECLARE_GPU_SPECS(float); +DECLARE_GPU_SPECS(double); +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPECS +} // namespace functor + +// Register the GPU kernels. 
+#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); +REGISTER_GPU_KERNEL(Eigen::half); +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA +} // namespace addons +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h new file mode 100644 index 00000000..b05c58e7 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ +#define TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace addons { + +enum class Combiner { + kSum, + kMean, +}; + +namespace functor { + +template +struct EmbeddingBagFunctor { + void operator()(const Device &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner); +}; + +template +struct EmbeddingBagBackwardFunctor { + void operator()(const Device &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context); +}; + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc new file mode 100644 index 00000000..7be3d552 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc @@ -0,0 +1,108 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "embedding_bag_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +namespace addons { + +typedef Eigen::GpuDevice GPUDevice; + +namespace { +// Define the GPU kernel. +template +__global__ void EmbeddingBagGPUKernel(const Tindices *__restrict__ indices, + const T *__restrict__ params, + const T *__restrict__ weights, + T *__restrict__ output, + const Eigen::Index output_dim, + const Eigen::Index sequence_length, + Combiner combiner) { + // blockIdx.x indicates which row of the output we are writing to. It also + // indicates which `bag` we're reading from. + // blockIdx.y indicates which chunk of that row we are writing to. + // threadIdx.x indicates which element of that chunk we are writing to. + + // feature_idx is the position in the final dimension of the output that we + // are writing to. + const Eigen::Index feature_idx = blockIdx.y * kThreadsPerBlock + threadIdx.x; + // It's necessary in case output_dim is not evenly divided by blockDim.x. + if (feature_idx < output_dim) { + // output_idx is the offset of the output we are writing to. + const Eigen::Index output_idx = blockIdx.x * output_dim + feature_idx; + // bag_offset is the offset in indices corresponding to the first + // index of the `bag` that we will be summing over. + const Eigen::Index bag_offset = blockIdx.x * sequence_length; + T accum = static_cast(0); + for (Eigen::Index idx_offset = bag_offset; + idx_offset < bag_offset + sequence_length; ++idx_offset) { + accum += params[indices[idx_offset] * output_dim + feature_idx] * + weights[idx_offset]; + } + if (combiner == Combiner::kMean) { + accum /= static_cast(sequence_length); + } + output[output_idx] = accum; + } +} +} // namespace + +namespace functor { +// Define the GPU implementation that launches the CUDA kernel. +template +struct EmbeddingBagFunctor { + static constexpr int kThreadsPerBlock = 32; + + void operator()(const GPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner) { + const Eigen::Index bags = indices.dimension(0); + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + const int blocks_per_value_vec = + Eigen::divup(output_dim, static_cast(kThreadsPerBlock)); + const dim3 grids = dim3(bags, blocks_per_value_vec); + + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagGPUKernel, grids, + kThreadsPerBlock, 0, device.stream(), indices.data(), params.data(), + weights.data(), output.data(), output_dim, sequence_length, combiner)); + } +}; + +// Explicit instantiation of the GPU functor. 
+#define DECLARE_GPU_SPECS(T) \ + template struct EmbeddingBagFunctor; \ + template struct EmbeddingBagFunctor; + +DECLARE_GPU_SPECS(Eigen::half); +DECLARE_GPU_SPECS(float); +DECLARE_GPU_SPECS(double); +#undef DECLARE_GPU_SPECS + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc b/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc new file mode 100644 index 00000000..38a39cb1 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc @@ -0,0 +1,70 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +namespace addons { + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeHandle; + +REGISTER_OP("Deepray>EmbeddingBag") + .Input("indices: Tindices") + .Input("params: T") + .Input("weights: T") + .Output("output: T") + .Attr("T: {bfloat16, half, float, double}") + .Attr("Tindices: {int32, int64}") + .Attr("combiner: {'SUM', 'MEAN'} = 'SUM'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle indices, params, weights, unused, output; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &indices)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, ¶ms)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &weights)); + DimensionHandle output_dim = c->Dim(params, 1); + TF_RETURN_IF_ERROR( + c->ReplaceDim(indices, c->Rank(indices) - 1, output_dim, &output)); + TF_RETURN_IF_ERROR(c->Merge(indices, weights, &unused)); + c->set_output(0, output); + return Status(); + }); + +REGISTER_OP("Deepray>EmbeddingBagGrad") + .Input("indices: Tindices") + .Input("params: T") + .Input("weights: T") + .Input("grads: T") + .Output("params_grads: T") + .Output("weights_grads: T") + .Attr("T: {bfloat16, half, float, double}") + .Attr("Tindices: {int32, int64}") + .Attr("combiner: {'SUM', 'MEAN'} = 'SUM'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle indices, params, weights, unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &indices)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, ¶ms)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &weights)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &unused)); + TF_RETURN_IF_ERROR(c->Merge(indices, weights, &unused)); + c->set_output(0, c->input(1)); + c->set_output(1, c->input(2)); + return Status(); + }); + +} // namespace addons +} // namespace tensorflow diff --git a/deepray/layers/nlp/__init__.py b/deepray/custom_ops/embedding_bag/python/__init__.py similarity index 100% rename from deepray/layers/nlp/__init__.py rename to deepray/custom_ops/embedding_bag/python/__init__.py diff --git a/deepray/custom_ops/embedding_bag/python/embedding_bag.py 
b/deepray/custom_ops/embedding_bag/python/embedding_bag.py new file mode 100644 index 00000000..9c6acc04 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/python/embedding_bag.py @@ -0,0 +1,143 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from typeguard import typechecked + +from deepray.utils.types import Constraint, Initializer, Regularizer +from deepray.utils.resource_loader import LazySO + +_embedding_bag_so = LazySO("custom_ops/embedding_bag/_embedding_bag_ops.so") + + +def _embedding_bag( + indices, + params, + weights=None, + combiner="sum", + name=None, +): + """EmbeddingBag computation. + + See [PyTorch op](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html). + + Equivalent to tf.gather() followed by tf.reduce_{sum,mean}() across the last dimension, with optional + weights. Fusing these into a single op has massive benefits for execution speed and particularly + memory usage, as the intermediate output of the gather never needs to be materialized. + + Args: + indices: An int32 or int64 `Tensor` of the indices to gather from + `params`. Must be at least 2-dimensional, as the last dimension + will be summed out. Maximum value must be less than params.shape[0]. + params: A float32 `Tensor` from which to gather params. Must be rank 2. + weights: A float32 `Tensor` of weights which will be applied to each of + the gathered embedding vectors before the sum step. + name: A name for the operation (optional). + + Returns: + A `Tensor` of the format specified by `data_format`. + """ + if weights is None: + weights = tf.ones_like(indices, dtype=params.dtype) + elif combiner != "sum": + raise RuntimeError("Combiner mode must be 'sum' when weights are supplied to EmbeddingBag!") + + return _embedding_bag_so.ops.deepray_embedding_bag(indices, params, weights, combiner=combiner.upper(), name=name) + + +@tf.RegisterGradient("Deepray>EmbeddingBag") +def _embedding_bag_grad(op, grads): + indices, params, weights = op.inputs[:3] + combiner = op.get_attr("combiner") + value_grads, weight_grads = _embedding_bag_so.ops.deepray_embedding_bag_grad( + indices, params, weights, grads, combiner=combiner + ) + return [None, value_grads, weight_grads] + + +@tf.keras.utils.register_keras_serializable(package="Deepray") +class EmbeddingBag(tf.keras.layers.Layer): + """EmbeddingBag Layer. + + See [PyTorch op](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html). + + Equivalent to tf.gather() followed by tf.reduce_sum() across the last dimension, with optional + weights. Fusing these into a single op has massive benefits for execution speed and particularly + memory usage, as the intermediate output of the gather never needs to be materialized. + + Input Shapes: + indices: An int32 or int64 `Tensor` of the indices to gather from + `params`. 
Must be at least 2-dimensional, as the last dimension + will be summed out. Maximum value must be less than params.shape[0]. + params: A float32 `Tensor` from which to gather params. Must be rank 2. + weights: A float32 `Tensor` of weights which will be applied to each of + the gathered embedding vectors before the sum step. + + Output shape: + indices.shape[:-1], params.shape[-1] + """ + + @typechecked + def __init__( + self, + input_dim: int, + output_dim: int, + embeddings_initializer: Initializer = "uniform", + embeddings_regularizer: Regularizer = None, + embeddings_constraint: Constraint = None, + mask_zero: bool = False, + combiner: str = "sum", + **kwargs, + ): + super(EmbeddingBag, self).__init__(**kwargs) + if input_dim <= 0 or output_dim <= 0: + raise ValueError( + "Both `input_dim` and `output_dim` should be positive, " + "found input_dim {} and output_dim {}".format(input_dim, output_dim) + ) + self.input_dim = input_dim + self.output_dim = output_dim + self.embeddings_initializer = tf.keras.initializers.get(embeddings_initializer) + self.embeddings_regularizer = tf.keras.regularizers.get(embeddings_regularizer) + self.embeddings_constraint = tf.keras.constraints.get(embeddings_constraint) + self.mask_zero = mask_zero + self.supports_masking = mask_zero + self.combiner = combiner + + def build(self, input_shape): + self.embeddings = self.add_weight( + shape=(self.input_dim, self.output_dim), + name="embeddings", + initializer=self.embeddings_initializer, + regularizer=self.embeddings_regularizer, + constraint=self.embeddings_constraint, + ) + self.built = True + + def call(self, indices, weights=None): + return _embedding_bag(indices, self.embeddings, weights, combiner=self.combiner) + + def get_config(self): + config = { + "input_dim": self.input_dim, + "output_dim": self.output_dim, + "embeddings_initializer": tf.keras.initializers.serialize(self.embeddings_initializer), + "embeddings_regularizer": tf.keras.regularizers.serialize(self.embeddings_regularizer), + "embeddings_constraint": tf.keras.constraints.serialize(self.embeddings_constraint), + "mask_zero": self.mask_zero, + "combiner": self.combiner, + } + base_config = super(EmbeddingBag, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/deepray/layers/nlp/transformer/__init__.py b/deepray/custom_ops/embedding_bag/python/tests/__init__.py similarity index 100% rename from deepray/layers/nlp/transformer/__init__.py rename to deepray/custom_ops/embedding_bag/python/tests/__init__.py diff --git a/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py b/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py new file mode 100644 index 00000000..f1d1ee33 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py @@ -0,0 +1,116 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for EmbeddingBag layer.""" + +import pytest +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_bag import EmbeddingBag, _embedding_bag +from deepray.utils import test_utils + + +def manual_embedding_bag(indices, params, weights=None, combiner="mean"): + gathered = tf.gather(params, indices) + if weights is not None: + gathered *= tf.expand_dims(weights, -1) + if combiner == "sum": + return tf.reduce_sum(gathered, -2, keepdims=False) + else: + assert combiner == "mean" + assert weights is None + return tf.reduce_mean(gathered, -2, keepdims=False) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +def test_forward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + expected = manual_embedding_bag(indices, params, weights, combiner=combiner) + embedding_bag = EmbeddingBag(input_dim, 16, combiner=combiner, dtype=dtype) + embedding_bag.build(indices.shape) + embedding_bag.set_weights([params]) + indices = tf.convert_to_tensor(indices) + if weights is not None: + weights = tf.convert_to_tensor(weights) + output = embedding_bag( + indices, + weights, + ) + test_utils.assert_allclose_according_to_type(expected, output, half_rtol=1e-2, half_atol=1e-2) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +@pytest.mark.usefixtures("maybe_run_functions_eagerly") +def test_backward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + + indices = tf.convert_to_tensor(indices) + params = tf.convert_to_tensor(params) + if weights is not None: + weights = tf.convert_to_tensor(weights) + + embedding_bag_fn = tf.function(_embedding_bag) + + if combiner == "sum": + with tf.GradientTape(persistent=True) as tape: + tape.watch([params, weights]) + output = embedding_bag_fn(indices, params, weights, combiner="sum") + expected = manual_embedding_bag(indices, params, weights, combiner="sum") + + grads = tape.gradient(output, [params, weights]) + expected_grads = tape.gradient(expected, [params, weights]) + # Gather returns sparse IndexedSlices so we have to sum them together. 
+ test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) + test_utils.assert_allclose_according_to_type(expected_grads[1], grads[1], half_rtol=1e-2, half_atol=1e-2) + else: + with tf.GradientTape(persistent=True) as tape: + tape.watch(params) + output = embedding_bag_fn(indices, params, combiner=combiner) + expected = manual_embedding_bag(indices, params, combiner=combiner) + + grads = tape.gradient(output, [params]) + expected_grads = tape.gradient(expected, [params]) + # Gather returns sparse IndexedSlices so we have to sum them together. + test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) diff --git a/deepray/seq2seq/tests/run_all_test.py b/deepray/custom_ops/embedding_bag/python/tests/run_all_test.py similarity index 72% rename from deepray/seq2seq/tests/run_all_test.py rename to deepray/custom_ops/embedding_bag/python/tests/run_all_test.py index d5c4af3d..8261049e 100644 --- a/deepray/seq2seq/tests/run_all_test.py +++ b/deepray/custom_ops/embedding_bag/python/tests/run_all_test.py @@ -1,8 +1,7 @@ from pathlib import Path import sys - import pytest if __name__ == "__main__": dirname = Path(__file__).absolute().parent - sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/embedding_variable/BUILD b/deepray/custom_ops/embedding_variable/BUILD new file mode 100644 index 00000000..ab953b1b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/BUILD @@ -0,0 +1,282 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") +load("@local_tsl//tsl/platform/default:build_config.bzl", "py_proto_library") +load("//deepray:deepray.bzl", "custom_op_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +proto_library( + name = "config_proto", + srcs = ["config.proto"], +) + +cc_proto_library( + name = "config_proto_cc", + deps = [":config_proto"], +) + +py_proto_library( + name = "config_proto_py_pb2", + srcs = ["config.proto"], + default_runtime = "@com_google_protobuf//:protobuf_python", + protoc = "@com_google_protobuf//:protoc", + srcs_version = "PY3", + deps = [ + "@com_google_protobuf//:protobuf_python", + ], +) + +py_library( + name = "embedding_variable", + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), + data = [ + ":_group_embedding_ops.so", + ":_kv_variable_ops.so", + ], + srcs_version = "PY3", + deps = [ + ":config_proto_py_pb2", + "//deepray/utils", + ], +) + +cc_library( + name = "save_restore_tensor_ev", + hdrs = [ + "cc/kernels/save_restore_tensor_ev.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:tensor_bundle", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "kv_variable_util", + srcs = ["cc/kernels/kv_variable_util.cc"], + hdrs = [ + "cc/kernels/kv_variable_util.h", + ], + copts = ["-Wno-unused-result"], + deps = [ + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +cuda_library( + name = "training_ali_lib", + srcs = [ + "cc/kernels/training_ali_ops_gpu.cu.cc", + ], + hdrs = [ + "cc/kernels/training_ali_ops_gpu.h", + ], + deps = [ + 
"//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +GROUP_EMBEDDING_OP_SRCS = [ + "cc/group_embedding/group_embedding_lookup_ops.cc", + "cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc", + "cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h", + "cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc", +] + +GROUP_EMBEDDING_OP_GPU_SRCS = [ + "cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc", + "cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc", + "cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h", + "cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h", +] + +cuda_library( + name = "fused_embedding_common_cuh", + hdrs = ["cc/fused_embedding/fused_embedding_common.cu.h"], +) + +FUSED_EMBEDDING_OP_SRCS = [ + "cc/fused_embedding/embedding_lookup_sparse_post_op.cc", + "cc/fused_embedding/embedding_lookup_sparse_pre_op.cc", + "cc/fused_embedding/fused_embedding_ops.cc", +] + +FUSED_EMBEDDING_OP_GPU_SRCS = [ + "cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc", + "cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc", +] + +custom_op_library( + name = "_kv_variable_ops.so", + srcs = [ + "cc/kernels/kv_variable_lookup_ops.cc", + "cc/kernels/kv_variable_ops.cc", + "cc/kernels/kv_variable_restore_ops.cc", + "cc/kernels/save_restore_ops.cc", + "cc/kernels/training_adagrad_ops.cc", + "cc/kernels/training_adam_async_ops.cc", + "cc/kernels/training_adam_ops.cc", + "cc/kernels/training_ali_op_helpers.h", + "cc/kernels/training_ftrl_ops.cc", + "cc/kernels/training_sgd_ops.cc", + "cc/ops/kv_variable_ops.cc", + "cc/ops/save_restore_ops.cc", + "cc/ops/training_adagrad_ops.cc", + "cc/ops/training_adam_async_ops.cc", + "cc/ops/training_adam_ops.cc", + "cc/ops/training_ftrl_ops.cc", + "cc/ops/training_sgd_ops.cc", + ], + copts = ["-Wno-unused-result"] + if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_deps = [ + ":training_ali_lib", + ], + gpu_srcs = [ + "cc/kernels/training_ali_ops_gpu.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/unique_ops:unique_ali_util", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + "@sparsehash_c11//:dense_hash_map", + ], +) + +py_test( + name = "multiplex_1_test", + size = "medium", + srcs = ["multiplex_1_test.py"], + python_version = "PY3", + srcs_version = "PY3", + tags = [ + "no_mac", # TODO(b/216321151): Re-enable this test. 
+ ], + deps = [ + ":embedding_variable", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", + ], +) + +custom_op_library( + name = "_raw_ops.so", + srcs = [ + "cc/kernels/embedding_collection.cc", + "cc/kernels/embedding_collection.hpp", + "cc/ops/embedding_collection.cc", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":hotness_calculate", + ], +) + +cuda_library( + name = "hotness_calculate", + srcs = [ + "cc/kernels/hotness_calculate.cu.cc", + ], + hdrs = [ + "cc/kernels/hotness_calculate.h", + ], + defines = [ + "TF_VERSION_MAJOR=2", + ], + deps = [ + "//deepray/custom_ops/utils:check_util", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cuda_runtime", + "@local_config_cuda//cuda:cudart", + ], +) + +custom_op_library( + name = "_save_restore_ops.so", + srcs = [ + "cc/kernels/save_restore_ops.cc", + "cc/ops/save_restore_ops.cc", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +custom_op_library( + name = "_group_embedding_ops.so", + srcs = [ + "cc/ops/group_embedding_ops.cc", + ] + GROUP_EMBEDDING_OP_SRCS, + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_deps = [ + ":fused_embedding_common_cuh", + ], + gpu_srcs = GROUP_EMBEDDING_OP_GPU_SRCS, + deps = [ + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/unique_ops:unique_ali_util", + ], +) + +cc_test( + name = "group_embedding_ops_test", + size = "small", + srcs = ["cc/group_embedding/group_embedding_lookup_ops_test.cc"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":_group_embedding_ops.so", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/utils:fake_input", + "//deepray/custom_ops/utils:kernel_benchmark_testlib", + "//deepray/custom_ops/utils:ops_testutil", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + +custom_op_library( + name = "_incr_save_restore_ops.so", + srcs = [ + "cc/incr_save_restore/incr_save_restore_ops.cc", + "cc/incr_save_restore/incr_save_restore_ops.h", + "cc/ops/incr_save_restore_ops.cc", + ], + deps = [ + ":save_restore_tensor_ev", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +cc_test( + name = "incr_save_restore_ops_test", + size = "small", + srcs = ["cc/incr_save_restore/incr_save_restore_ops_test.cc"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":_incr_save_restore_ops.so", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/utils:fake_input", + "//deepray/custom_ops/utils:kernel_benchmark_testlib", + "//deepray/custom_ops/utils:ops_testutil", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/deepray/custom_ops/embedding_variable/__init__.py b/deepray/custom_ops/embedding_variable/__init__.py new file mode 100644 index 00000000..abbf9a39 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/__init__.py @@ -0,0 +1,3 @@ +from .python import kv_variable_ops +from .python import group_embedding_lookup_ops +from .python.kv_variable_ops import gen_kv_variable_ops \ 
No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/BUILD b/deepray/custom_ops/embedding_variable/cc/embedding/BUILD new file mode 100644 index 00000000..8a2b3f95 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/BUILD @@ -0,0 +1,269 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") +load( + "@org_tensorflow//tensorflow:tensorflow.bzl", + "tf_copts", +) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "embedding_ops_lib", + deps = [ + ":embedding_gpu", + ":embedding_var", + ":embedding_var_ckpt_data", + ":multi_tier_storage", + ":ssd_record_descriptor", + ], +) + +cc_library( + name = "ssd_record_descriptor", + srcs = ["ssd_record_descriptor.cc"], + hdrs = [ + "counter_filter_descriptor_impl.h", + "dynamic_dim_feature_descriptor_impl.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var_dump_iterator.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "hbm_multi_tier_feature_descriptor.h", + "kv_interface.h", + "normal_feature_descriptor.h", + "ssd_record_descriptor.h", + ], + copts = [ + "-Wno-unused-result", + "-Wno-c++11-narrowing", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:save_restore_tensor_ev", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_lib", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cuda_library( + name = "multi_tier_storage", + srcs = [ + "multi_tier_storage.cu.cc", + ], + hdrs = [ + "bloom_filter_policy.h", + "cache.h", + "cache_factory.h", + "cache_thread_pool_creator.h", + "counter_filter_descriptor_impl.h", + "counter_filter_policy.h", + "cpu_hash_map_kv.h", + "dram_leveldb_storage.h", + "dram_pmem_storage.h", + "dram_ssd_storage.h", + "dynamic_dim_feature_descriptor_impl.h", + "emb_file.h", + "emb_file_creator.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var.h", + "embedding_var_ckpt_data.h", + "embedding_var_context.h", + "embedding_var_dump_iterator.h", + "embedding_var_restore.h", + "eviction_manager.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "filter_factory.h", + "filter_policy.h", + "globalstep_shrink_policy.h", + "gpu_hash_map_kv.h", + "hbm_dram_ssd_storage.h", + "hbm_dram_storage.h", + "hbm_multi_tier_feature_descriptor.h", + "hbm_storage_iterator.h", + "intra_thread_copy_id_allocator.h", + "kv_interface.h", + "l2weight_shrink_policy.h", + "leveldb_kv.h", + "multi_tier_storage.h", + "normal_feature_descriptor.h", + "nullable_filter_policy.h", + "shrink_policy.h", + "single_tier_storage.h", + "ssd_hash_kv.h", + "ssd_record_descriptor.h", + "storage.h", + "storage_config.h", + "storage_factory.h", + ], + copts = [ + "-Wno-unused-result", + ], + deps = [ + ":embedding_gpu", + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/common_runtime:core_cpu", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_runtime", + # "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@local_config_tf//:libtensorflow_framework", + 
"@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cc_library( + name = "embedding_var_ckpt_data", + srcs = ["embedding_var_ckpt_data.cc"], + hdrs = [ + "counter_filter_descriptor_impl.h", + "dynamic_dim_feature_descriptor_impl.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var_ckpt_data.h", + "embedding_var_dump_iterator.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "hbm_multi_tier_feature_descriptor.h", + "kv_interface.h", + "normal_feature_descriptor.h", + ], + copts = [ + "-Wno-c++11-narrowing", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:save_restore_tensor_ev", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_lib", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cuda_library( + name = "embedding_gpu", + srcs = [ + "batch.cu.cc", + "gpu_hash_table.cu.cc", + ], + hdrs = [ + "batch.h", + "gpu_hash_table.h", + ], + copts = tf_copts(allow_exceptions = True) + if_cuda([ + "-DNV_CUDNN_DISABLE_EXCEPTION", + ]) + select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + #"-nvcc_options=ftz=true", + ], + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ], + }), + visibility = ["//visibility:public"], + deps = [ + "@com_github_google_leveldb//:leveldb", + "@cuCollections//:cuco_hash_table", + "@libcuckoo", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/platform:stream_executor", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], + alwayslink = 1, +) + +cuda_library( + name = "embedding_var", + srcs = [ + "embedding_var.cu.cc", + "embedding_var_restore.cc", + ], + hdrs = [ + "bloom_filter_policy.h", + "cache.h", + "cache_factory.h", + "cache_thread_pool_creator.h", + "counter_filter_descriptor_impl.h", + "counter_filter_policy.h", + "cpu_hash_map_kv.h", + "dram_leveldb_storage.h", + "dram_pmem_storage.h", + "dram_ssd_storage.h", + "dynamic_dim_feature_descriptor_impl.h", + "emb_file.h", + "emb_file_creator.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var.h", + "embedding_var_context.h", + "embedding_var_dump_iterator.h", + "embedding_var_restore.h", + "eviction_manager.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "filter_factory.h", + "filter_policy.h", + "globalstep_shrink_policy.h", + "gpu_hash_map_kv.h", + "hbm_dram_ssd_storage.h", + "hbm_dram_storage.h", + "hbm_multi_tier_feature_descriptor.h", + "hbm_storage_iterator.h", + "intra_thread_copy_id_allocator.h", + "kv_interface.h", + "l2weight_shrink_policy.h", + "leveldb_kv.h", + "normal_feature_descriptor.h", + "nullable_filter_policy.h", + "shrink_policy.h", + "single_tier_storage.h", + "ssd_hash_kv.h", + "storage.h", + "storage_config.h", + "storage_factory.h", + ], + copts = tf_copts() + ["-g"] + select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + ], + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ], + }) + [ + "-Wno-unused-result", + ], + deps = [ + 
"//deepray/custom_ops/embedding_variable/cc/lib:allocator", + ":embedding_gpu", + ":embedding_var_ckpt_data", + ":multi_tier_storage", + ":ssd_record_descriptor", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_runtime", + # "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc new file mode 100644 index 00000000..6323b151 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc @@ -0,0 +1,219 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#if GOOGLE_CUDA + +#include "batch.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +__global__ void BatchCopy(V** batch, V* val_base, int value_len, int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / value_len; + int item_pos = i % value_len; + + if (i < limit * value_len) { + val_base[i] = *(batch[item_id] + item_pos); + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void BatchCopy(T**, T*, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int32(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int64(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX + +template +__global__ void BatchUnpack(V** dev_value_address, V* memcpy_buffer_gpu, + int value_len, int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / value_len; + int item_pos = i % value_len; + + if (i < limit * value_len) { + *(dev_value_address[item_id] + item_pos) = memcpy_buffer_gpu[i]; + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void BatchUnpack(T**, T*, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int32(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int64(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX + +template +__global__ void CopyEmbedding(V** batch, V** batch_data_space, int total_dims, + int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / total_dims; + int item_pos = i % total_dims; + + if (i < limit * total_dims) { + *(batch_data_space[item_id] + item_pos) = *(batch[item_id] + item_pos); + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void CopyEmbedding(T**, T**, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX +} // namespace embedding + +template +__global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr, + int embedding_dim, long 
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(a[item_id] + item_pos) += g[i] * g[i];
+    *(v[item_id] + item_pos) -= lr * g[i] * rsqrt(*(a[item_id] + item_pos));
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                                      \
+  template __global__ void SparseApplyAdagradGPU<T>(T**, T**, const T*, T, \
+                                                    int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamGPU(V** var, V** m, V** v, const V* g, V lr,
+                                   V beta1, V beta2, V epsilon, V beta1_power,
+                                   V beta2_power, int embedding_dim,
+                                   long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    const V alpha = lr * sqrt(static_cast<V>(1) - beta2_power) /
+                    (static_cast<V>(1) - beta1_power);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 + g[i] * (1 - beta1);
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1 - beta2);
+    *(var[item_id] + item_pos) -= (*(m[item_id] + item_pos) * alpha) /
+                                  (sqrt(*(v[item_id] + item_pos)) + epsilon);
+  }
+  __syncthreads();
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)             \
+  template __global__ void SparseApplyAdamGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamAsyncGPU(V** var, V** m, V** v, const V* g,
+                                        V lr, V beta1, V beta2, V epsilon,
+                                        V* beta1_power_ptr, V* beta2_power_ptr,
+                                        int embedding_dim,
+                                        long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    V beta1_power = *beta1_power_ptr;
+    V beta2_power = *beta2_power_ptr;
+    const V alpha = lr * sqrt(static_cast<V>(1) - beta2_power) /
+                    (static_cast<V>(1) - beta1_power);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 + g[i] * (1 - beta1);
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1 - beta2);
+    *(var[item_id] + item_pos) -= (*(m[item_id] + item_pos) * alpha) /
+                                  (sqrt(*(v[item_id] + item_pos)) + epsilon);
+  }
+  __syncthreads();
+
+  if (i == 0) {
+    *beta1_power_ptr *= beta1;
+    *beta2_power_ptr *= beta2;
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                  \
+  template __global__ void SparseApplyAdamAsyncGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T*, T*, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamAsyncSparseRmspropGPU(V** var, V** m, V** v,
+                                                     const V* g, V lr, V beta1,
+                                                     V beta2, V epsilon,
+                                                     int embedding_dim,
+                                                     long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1.0 - beta2);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 +
+        rsqrt(*(v[item_id] + item_pos) + epsilon) * lr * g[i];
+    *(var[item_id] + item_pos) -= *(m[item_id] + item_pos);
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                               \
+  template __global__ void SparseApplyAdamAsyncSparseRmspropGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, const V* g, V alpha,
+                                    V beta1, V beta2, V epsilon, V weight_decay,
+                                    int embedding_dim, long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(m[item_id] + item_pos) +=
+        (g[i] - *(m[item_id] + item_pos)) * (1.0 - beta1);
+    *(v[item_id] + item_pos) +=
+        (g[i] * g[i] - *(v[item_id] + item_pos)) * (1.0 - beta2);
+    *(var[item_id] + item_pos) -=
+        (*(m[item_id] + item_pos) * alpha) /
+            (sqrt(*(v[item_id] + item_pos)) + epsilon) +
+        weight_decay * (*(var[item_id] + item_pos));
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)              \
+  template __global__ void SparseApplyAdamWGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/batch.h b/deepray/custom_ops/embedding_variable/cc/embedding/batch.h
new file mode 100644
index 00000000..800e2e3c
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/batch.h
@@ -0,0 +1,66 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
+
+#if GOOGLE_CUDA
+namespace tensorflow {
+namespace embedding {
+
+template <class V>
+__global__ void BatchCopy(V** batch, V* val_base, int value_len, int limit);
+
+template <class V>
+__global__ void BatchUnpack(V** dev_value_address, V* memcpy_buffer_gpu,
+                            int value_len, int limit);
+
+template <class V>
+__global__ void CopyEmbedding(V** batch, V** batch_data_space, int total_dims,
+                              int limit);
+}  // namespace embedding
+
+template <class V>
+__global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr,
+                                      int embedding_dim, long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamGPU(V** var, V** m, V** v, const V* g, V lr,
+                                   V beta1, V beta2, V epsilon, V beta1_power,
+                                   V beta2_power, int embedding_dim,
+                                   long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamAsyncGPU(V** var, V** m, V** v, const V* g,
+                                        V lr, V beta1, V beta2, V epsilon,
+                                        V* beta1_power_ptr, V* beta2_power_ptr,
+                                        int embedding_dim, long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamAsyncSparseRmspropGPU(V** var, V** m, V** v,
+                                                     const V* g, V lr, V beta1,
+                                                     V beta2, V epsilon,
+                                                     int embedding_dim,
+                                                     long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, const V* g, V alpha,
+                                    V beta1, V beta2, V epsilon, V weight_decay,
+                                    int embedding_dim, long long int limit);
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h
new file mode 100644
index 00000000..6d30bbc8
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h
@@ -0,0 +1,438 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ + +#include "embedding_config.h" +#include "filter_policy.h" +#include "intra_thread_copy_id_allocator.h" + +namespace tensorflow { + +namespace { +const static std::vector default_seeds = { + 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, + 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; +} + +template +class BloomFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), FilterPolicy(config, ev) { + switch (config_.counter_type) { + case DT_UINT64: + VLOG(2) << "The type of bloom counter is uint64"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(long)); + break; + case DT_UINT32: + VLOG(2) << "The type of bloom counter is uint32"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(int)); + break; + case DT_UINT16: + VLOG(2) << "The type of bloom counter is uint16"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(int16)); + break; + case DT_UINT8: + VLOG(2) << "The type of bloom counter is uint8"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(bool)); + break; + default: + VLOG(2) << "defualt type of counter is uint64"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(long)); + } + GenerateSeed(config.kHashFunc); + } + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok()) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> lookup_or_create_ids(num_worker_threads); + std::vector> lookup_or_create_cursor(num_worker_threads); + std::vector> lookup_or_create_ptrs(num_worker_threads); + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + std::vector> 
not_found_cursor_list(num_worker_threads + 1); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + auto do_work = [this, keys, value_ptrs_list, &lookup_or_create_ids, + &lookup_or_create_ptrs, &lookup_or_create_cursor, + main_thread_id, + &thread_copy_id_alloc](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int i = start; i < limit; i++) { + if (GetBloomFreq(keys[i]) >= config_.filter_freq) { + lookup_or_create_ids[copy_id].emplace_back(keys[i]); + lookup_or_create_ptrs[copy_id].emplace_back(value_ptrs_list[i]); + lookup_or_create_cursor[copy_id].emplace_back(i); + } else { + AddFreq(keys[i], 1); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + std::vector total_ids(num_of_keys); + std::vector total_ptrs(num_of_keys); + std::vector total_cursors(num_of_keys); + int num_of_admit_id = 0; + for (int i = 0; i < num_worker_threads; i++) { + if (lookup_or_create_ids[i].size() > 0) { + memcpy(total_ids.data() + num_of_admit_id, + lookup_or_create_ids[i].data(), + sizeof(K) * lookup_or_create_ids[i].size()); + memcpy(total_ptrs.data() + num_of_admit_id, + lookup_or_create_ptrs[i].data(), + sizeof(void*) * lookup_or_create_ptrs[i].size()); + memcpy(total_cursors.data() + num_of_admit_id, + lookup_or_create_cursor[i].data(), + sizeof(int) * lookup_or_create_cursor[i].size()); + num_of_admit_id += lookup_or_create_ids[i].size(); + } + } + + ev_->BatchLookupOrCreateKey(ctx, total_ids.data(), total_ptrs.data(), + num_of_keys, not_found_cursor_list); + for (int i = 0; i < total_ptrs.size(); i++) { + value_ptrs_list[total_cursors[i]] = total_ptrs[i]; + } + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + if (GetBloomFreq(key) >= config_.filter_freq) { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + AddFreq(key, count); + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + int64 count) override { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = OkStatus(); + } + *is_filter = true; + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); + } + return OkStatus(); + } + + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } + + int64 GetFreq(K key) override { return GetBloomFreq(key); } + + void* GetBloomCounter() const { return bloom_counter_; } + + bool is_admit(K key, void* value_ptr) override { + if (value_ptr == nullptr) { + return false; + } else { + return GetFreq(key, value_ptr) >= config_.filter_freq; + } + } + + private: + int64 GetBloomFreq(K key) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + int64 min_freq; + switch (config_.counter_type) { + case DT_UINT64: + min_freq = GetMinFreq(hash_val); + break; + case 
DT_UINT32: + min_freq = GetMinFreq(hash_val); + break; + case DT_UINT16: + min_freq = GetMinFreq(hash_val); + break; + case DT_UINT8: + min_freq = GetMinFreq(hash_val); + break; + default: + min_freq = GetMinFreq(hash_val); + } + return min_freq; + } + +#define mix(h) \ + ({ \ + (h) ^= (h) >> 23; \ + (h) *= 0x2127599bf4325c37ULL; \ + (h) ^= (h) >> 47; \ + }) + + uint64_t FastHash64(K key, uint64_t seed) { + const uint64_t m = 0x880355f21e6d1965ULL; + + uint64_t h = seed ^ (8 * m); + uint64_t v; + v = key; + h ^= mix(v); + h *= m; + + v = 0; + h ^= mix(v); + h *= m; + + return mix(h); + } + + template + int64 GetMinFreq(std::vector hash_val) { + VBloom min_freq = *((VBloom*)bloom_counter_ + hash_val[0]); + for (auto it : hash_val) { + min_freq = std::min(*((VBloom*)bloom_counter_ + it), min_freq); + } + return min_freq; + } + + template + void SetMinFreq(std::vector hash_val, int64 freq) { + for (auto it : hash_val) { + *((VBloom*)bloom_counter_ + it) = freq; + } + } + + void SetBloomFreq(K key, int64 freq) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + switch (config_.counter_type) { + case DT_UINT64: + SetMinFreq(hash_val, freq); + break; + case DT_UINT32: + SetMinFreq(hash_val, freq); + break; + case DT_UINT16: + SetMinFreq(hash_val, freq); + break; + case DT_UINT8: + SetMinFreq(hash_val, freq); + break; + default: + SetMinFreq(hash_val, freq); + } + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (to_dram) { + LOG(FATAL) << "BloomFilter dosen't support ImportToDRAM"; + return OkStatus(); + } + + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + void* value_ptr = nullptr; + int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + if (!is_filter) { + if (freq_buff[i] >= config_.filter_freq) { + SetBloomFreq(key_buff[i], freq_buff[i]); + } else { + SetBloomFreq(key_buff[i], config_.filter_freq); + new_freq = config_.filter_freq; + } + } else { + SetBloomFreq(key_buff[i], freq_buff[i]); + } + if (new_freq >= config_.filter_freq) { + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); + } + } + return OkStatus(); + } + + void AddFreq(K key) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + + for (auto it : hash_val) { + switch (config_.counter_type) { + case DT_UINT64: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, 1); + break; + case DT_UINT32: + if (*((uint32*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint32*)bloom_counter_ + it, 1); + break; + case DT_UINT16: + if (*((uint16*)bloom_counter_ + it) < config_.filter_freq) + 
__sync_fetch_and_add((uint16*)bloom_counter_ + it, 1); + break; + case DT_UINT8: + if (*((uint8*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint8*)bloom_counter_ + it, 1); + break; + default: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, 1); + } + } + } + + void AddFreq(K key, int64 count) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + + for (auto it : hash_val) { + switch (config_.counter_type) { + case DT_UINT64: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, count); + break; + case DT_UINT32: + if (*((uint32*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint32*)bloom_counter_ + it, count); + break; + case DT_UINT16: + if (*((uint16*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint16*)bloom_counter_ + it, count); + break; + case DT_UINT8: + if (*((uint8*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint8*)bloom_counter_ + it, count); + break; + default: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, count); + } + } + } + + void GenerateSeed(int64 kHashFunc) { + if (kHashFunc < default_seeds.size()) { + for (int64 i = 0; i < kHashFunc; i++) { + seeds_.emplace_back(default_seeds[i]); + } + } else { + for (int64 i = 0; i < default_seeds.size(); i++) { + seeds_.emplace_back(default_seeds[i]); + } + int64 last_seed = 98; + for (int64 i = default_seeds.size(); i < kHashFunc; i++) { + for (int64 j = last_seed;; j++) { + if (j % 2 == 0) continue; + bool is_prime = true; + for (int64 k = 0; k <= std::sqrt(j) + 1; k++) { + if (j % k == 0) is_prime = false; + } + if (is_prime) { + seeds_.emplace_back(j); + last_seed = j; + break; + } + } + } + } + } + + private: + void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; + std::vector seeds_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache.h new file mode 100644 index 00000000..5c9a51a9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache.h @@ -0,0 +1,521 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace embedding { + +template +class BatchCache { + public: + BatchCache() {} + virtual ~BatchCache() {} + void update(const Tensor& t) { update((K*)t.data(), t.NumElements()); } + void add_to_prefetch_list(const Tensor& t) { + add_to_prefetch_list((K*)t.data(), t.NumElements()); + } + void add_to_cache(const Tensor& t) { + add_to_cache((K*)t.data(), t.NumElements()); + } + + void update(const Tensor& t, const Tensor& counts_tensor) { + update((K*)t.data(), t.NumElements(), nullptr, + (int64*)counts_tensor.data()); + } + + virtual size_t get_evic_ids(K* evic_ids, size_t k_size) = 0; + virtual size_t get_cached_ids(K* cached_ids, size_t k_size, + int64* 
cached_versions, + int64* cached_freqs) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + bool use_locking = true) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + const int64* batch_versions, const int64* batch_freqs, + bool use_locking = true) = 0; + virtual void add_to_prefetch_list(const K* batch_ids, size_t batch_size) = 0; + virtual void add_to_cache(const K* batch_ids, size_t batch_size) = 0; + virtual size_t size() = 0; + virtual void reset_status() { + num_hit = 0; + num_miss = 0; + } + std::string DebugString() { + float hit_rate = 0.0; + if (num_hit > 0 || num_miss > 0) { + hit_rate = num_hit * 100.0 / (num_hit + num_miss); + } + return strings::StrCat("HitRate = ", hit_rate, + " %, visit_count = ", num_hit + num_miss, + ", hit_count = ", num_hit); + } + virtual mutex_lock maybe_lock_cache(mutex& mu, mutex& temp_mu, + bool use_locking) { + if (use_locking) { + mutex_lock l(mu); + return l; + } else { + mutex_lock l(temp_mu); + return l; + } + } + + protected: + int64 num_hit; + int64 num_miss; +}; + +template +class PrefetchNode { + public: + explicit PrefetchNode() : key_(-1), ref_count_(1) {} + explicit PrefetchNode(K id) : key_(id), ref_count_(1) {} + virtual ~PrefetchNode() {} + virtual void Ref() { ref_count_++; }; + virtual void UnRef() { ref_count_--; }; + virtual K key() { return key_; } + virtual int64 ref_count() { return ref_count_; } + + protected: + K key_; + int64 ref_count_; +}; + +template +class PrefetchLFUNode : public PrefetchNode { + public: + explicit PrefetchLFUNode(K id) { + PrefetchNode::key_ = id; + PrefetchNode::ref_count_ = 1; + freq_ = 1; + } + + PrefetchLFUNode(K id, int64 freq) { + PrefetchNode::key_ = id; + PrefetchNode::ref_count_ = 1; + freq_ = freq; + } + + void Ref() override { + PrefetchNode::ref_count_++; + freq_++; + } + + int64 freq() { return freq_; } + + private: + int64 freq_; +}; + +template +class LRUCache : public BatchCache { + public: + LRUCache() { + mp.clear(); + head = new LRUNode(0); + tail = new LRUNode(0); + head->next = tail; + tail->pre = head; + BatchCache::num_hit = 0; + BatchCache::num_miss = 0; + } + + size_t size() { + mutex_lock l(mu_); + return mp.size(); + } + + size_t get_evic_ids(K* evic_ids, size_t k_size) { + mutex_lock l(mu_); + size_t true_size = 0; + LRUNode* evic_node = tail->pre; + LRUNode* rm_node = evic_node; + for (size_t i = 0; i < k_size && evic_node != head; ++i) { + evic_ids[i] = evic_node->id; + rm_node = evic_node; + evic_node = evic_node->pre; + mp.erase(rm_node->id); + delete rm_node; + true_size++; + } + evic_node->next = tail; + tail->pre = evic_node; + return true_size; + } + + size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, + int64* cached_freqs) override { + mutex_lock l(mu_); + LRUNode* it = head->next; + size_t i; + for (i = 0; i < k_size && it != tail; i++, it = it->next) { + cached_ids[i] = it->id; + } + return i; + } + + void update(const K* batch_ids, size_t batch_size, bool use_locking = true) { + mutex temp_mu; + auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + typename std::map::iterator it = mp.find(id); + if (it != mp.end()) { + LRUNode* node = it->second; + node->pre->next = node->next; + node->next->pre = node->pre; + head->next->pre = node; + node->next = head->next; + head->next = node; + node->pre = head; + BatchCache::num_hit++; + } else { + LRUNode* newNode = new LRUNode(id); + head->next->pre = newNode; + newNode->next = 
head->next; + head->next = newNode; + newNode->pre = head; + mp[id] = newNode; + BatchCache::num_miss++; + } + } + } + + void update(const K* batch_ids, size_t batch_size, const int64* batch_version, + const int64* batch_freqs, bool use_locking = true) override { + // TODO: add to rank accroding to the version of ids + update(batch_ids, batch_size); + } + + void add_to_prefetch_list(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + auto it_cache = mp.find(id); + if (it_cache != mp.end()) { + LRUNode* node = it_cache->second; + node->pre->next = node->next; + node->next->pre = node->pre; + delete node; + mp.erase(id); + } + prefetch_id_table[id] = new PrefetchNode(id); + } else { + it_prefetch->second->Ref(); + } + } + } + + void add_to_cache(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + std::vector ids_to_cache(batch_size); + int64 nums_to_cache = 0; + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + LOG(FATAL) << "The id should be prefetched before being used."; + } + it_prefetch->second->UnRef(); + if (it_prefetch->second->ref_count() == 0) { + delete it_prefetch->second; + prefetch_id_table.erase(id); + ids_to_cache[nums_to_cache] = id; + nums_to_cache++; + } + } + update(ids_to_cache.data(), nums_to_cache, false); + } + + private: + class LRUNode { + public: + K id; + LRUNode *pre, *next; + LRUNode(K id) : id(id), pre(nullptr), next(nullptr) {} + }; + LRUNode *head, *tail; + std::map mp; + std::unordered_map*> prefetch_id_table; + mutex mu_; +}; + +template +class LFUCache : public BatchCache { + public: + LFUCache() { + min_freq = std::numeric_limits::max(); + max_freq = 0; + freq_table.emplace_back( + std::pair*, int64>(new std::list, 0)); + BatchCache::num_hit = 0; + BatchCache::num_miss = 0; + } + + size_t size() { + mutex_lock l(mu_); + return key_table.size(); + } + + size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, + int64* cached_freqs) override { + mutex_lock l(mu_); + size_t i = 0; + size_t curr_freq = max_freq; + auto it = freq_table[max_freq - 1].first->begin(); + while (i < k_size && curr_freq >= min_freq) { + cached_ids[i] = (*it).key; + cached_freqs[i] = (*it).freq; + i++; + it++; + if (it == freq_table[curr_freq - 1].first->end()) { + do { + curr_freq--; + } while (freq_table[curr_freq - 1].second == 0 && + curr_freq >= min_freq); + if (curr_freq >= min_freq) { + it = freq_table[curr_freq - 1].first->begin(); + } + } + } + return i; + } + + size_t get_evic_ids(K* evic_ids, size_t k_size) { + mutex_lock l(mu_); + size_t true_size = 0; + size_t st_freq = min_freq; + for (size_t i = 0; i < k_size && key_table.size() > 0; ++i) { + auto rm_it = freq_table[st_freq - 1].first->back(); + key_table.erase(rm_it.key); + evic_ids[i] = rm_it.key; + ++true_size; + freq_table[st_freq - 1].first->pop_back(); + freq_table[st_freq - 1].second--; + if (freq_table[st_freq - 1].second == 0) { + ++st_freq; + while (st_freq <= max_freq) { + if (freq_table[st_freq - 1].second == 0) { + ++st_freq; + } else { + min_freq = st_freq; + break; + } + } + if (st_freq > max_freq) { + reset_min_and_max_freq(); + } + } + } + return true_size; + } + + void update(const K* batch_ids, size_t batch_size, bool use_locking = true) { + mutex temp_mu; + auto lock = 
BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it = key_table.find(id); + if (it == key_table.end()) { + freq_table[0].first->emplace_front(LFUNode(id, 1)); + freq_table[0].second++; + key_table[id] = freq_table[0].first->begin(); + min_freq = 1; + max_freq = std::max(max_freq, min_freq); + BatchCache::num_miss++; + } else { + typename std::list::iterator node = it->second; + size_t freq = node->freq; + freq_table[freq - 1].first->erase(node); + freq_table[freq - 1].second--; + if (freq_table[freq - 1].second == 0) { + if (min_freq == freq) min_freq += 1; + } + if (freq == freq_table.size()) { + freq_table.emplace_back( + std::pair*, int64>(new std::list, 0)); + } + max_freq = std::max(max_freq, freq + 1); + freq_table[freq].first->emplace_front(LFUNode(id, freq + 1)); + freq_table[freq].second++; + key_table[id] = freq_table[freq].first->begin(); + BatchCache::num_hit++; + } + } + } + + void update(const K* batch_ids, const size_t batch_size, + const int64* batch_versions, const int64* batch_freqs, + bool use_locking = true) override { + mutex temp_mu; + auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it = key_table.find(id); + size_t freq = batch_freqs[i]; + if (it == key_table.end()) { + if (freq < min_freq) { + min_freq = freq; + } + + if (freq > max_freq) { + max_freq = freq; + int64 prev_size = freq_table.size(); + if (max_freq > prev_size) { + freq_table.resize( + max_freq, std::pair*, int64>(nullptr, 0)); + for (int64 j = prev_size; j < max_freq; j++) { + freq_table[j].first = new std::list; + } + } + } + freq_table[freq - 1].first->emplace_front(LFUNode(id, freq)); + freq_table[freq - 1].second++; + key_table[id] = freq_table[freq - 1].first->begin(); + BatchCache::num_miss++; + } else { + typename std::list::iterator node = it->second; + size_t last_freq = node->freq; + size_t curr_freq = last_freq + freq; + freq_table[last_freq - 1].first->erase(node); + freq_table[last_freq - 1].second--; + + if (curr_freq > max_freq) { + max_freq = curr_freq; + freq_table.resize(max_freq, std::pair*, int64>( + new std::list, 0)); + } + + if (freq_table[last_freq - 1].second == 0) { + if (min_freq == last_freq) { + update_min_freq(); + } + } + + freq_table[curr_freq - 1].first->emplace_front(LFUNode(id, curr_freq)); + freq_table[curr_freq - 1].second++; + key_table[id] = freq_table[curr_freq - 1].first->begin(); + BatchCache::num_hit++; + } + } + } + + void add_to_prefetch_list(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + auto it_cache = key_table.find(id); + if (it_cache != key_table.end()) { + auto cache_node = it_cache->second; + int64 freq = cache_node->freq; + freq_table[freq - 1].first->erase(cache_node); + freq_table[freq - 1].second--; + key_table.erase(id); + if (freq_table[freq - 1].second == 0) { + if (freq == max_freq) { + update_max_freq(); + } + if (freq == min_freq) { + update_min_freq(); + } + } + prefetch_id_table[id] = new PrefetchLFUNode(id, freq); + } else { + prefetch_id_table[id] = new PrefetchLFUNode(id); + } + } else { + it_prefetch->second->Ref(); + } + } + } + + void add_to_cache(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + std::vector ids_to_cache(batch_size); + std::vector 
freqs_to_cache(batch_size); + int64 nums_to_cache = 0; + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + LOG(FATAL) << "The id should be prefetched before being used."; + } + it_prefetch->second->UnRef(); + if (it_prefetch->second->ref_count() == 0) { + int64 freq = it_prefetch->second->freq(); + delete it_prefetch->second; + prefetch_id_table.erase(id); + ids_to_cache[nums_to_cache] = id; + freqs_to_cache[nums_to_cache] = freq; + nums_to_cache++; + } + } + const int64* versions_to_cache = nullptr; + update(ids_to_cache.data(), nums_to_cache, versions_to_cache, + freqs_to_cache.data(), false); + } + + private: + void reset_min_and_max_freq() { + min_freq = std::numeric_limits::max(); + max_freq = 0; + } + + void update_min_freq() { + size_t i; + for (i = min_freq + 1; i <= max_freq; i++) { + if (freq_table[i - 1].second != 0) { + min_freq = i; + break; + } + } + if (i > max_freq) { + reset_min_and_max_freq(); + } + } + + void update_max_freq() { + size_t i; + for (i = max_freq - 1; i >= min_freq; i--) { + if (freq_table[i - 1].second != 0) { + max_freq = i; + break; + } + } + if (i < min_freq) { + reset_min_and_max_freq(); + } + } + + class LFUNode { + public: + K key; + size_t freq; + LFUNode(K key, size_t freq) : key(key), freq(freq) {} + }; + size_t min_freq; + size_t max_freq; + std::vector*, int64>> freq_table; + std::unordered_map::iterator> key_table; + std::unordered_map*> prefetch_id_table; + mutex mu_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h new file mode 100644 index 00000000..97e4cf2c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h @@ -0,0 +1,47 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
+
+#include "cache.h"
+#include "deepray/custom_ops/embedding_variable/config.pb.h"
+
+namespace tensorflow {
+namespace embedding {
+class CacheFactory {
+ public:
+  template <typename K>
+  static BatchCache<K>* Create(CacheStrategy cache_strategy, std::string name) {
+    switch (cache_strategy) {
+      case CacheStrategy::LRU:
+        LOG(INFO) << " Use Storage::LRU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LRUCache<K>();
+      case CacheStrategy::LFU:
+        LOG(INFO) << " Use Storage::LFU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LFUCache<K>();
+      default:
+        LOG(INFO) << " Invalid Cache strategy, \
+            use LFU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LFUCache<K>();
+    }
+  }
+};
+}  // namespace embedding
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h
new file mode 100644
index 00000000..3c43a41c
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h
@@ -0,0 +1,45 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { +template +class MultiTierStorage; + +class CacheThreadPoolCreator { + public: + static thread::ThreadPool* Create() { + int64 num_threads = 1; + TF_CHECK_OK( + ReadInt64FromEnvVar("TF_MULTI_TIER_EV_CACHE_THREADS", 1, &num_threads)); + static thread::ThreadPool cache_thread_pool(Env::Default(), ThreadOptions(), + "MultiTier_Embedding_Cache", + num_threads, + /*low_latency_hint=*/false); + return &cache_thread_pool; + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/config.proto b/deepray/custom_ops/embedding_variable/cc/embedding/config.proto new file mode 100644 index 00000000..424fc5e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/config.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package tensorflow.embedding; + +enum StorageType { + // none + DEFAULT = 0; + + // one level + DRAM = 1; + PMEM_MEMKIND = 2; + PMEM_LIBPMEM = 3; + SSDHASH = 4; + LEVELDB = 5; + HBM = 6; + + // two level + DRAM_PMEM = 11; + DRAM_SSDHASH = 12; + HBM_DRAM = 13; + DRAM_LEVELDB = 14; + + // three level + DRAM_PMEM_SSDHASH = 101; + HBM_DRAM_SSDHASH = 102; + +} + +enum CopyBackFlag { + NOT_COPYBACK = 0; + COPYBACK = 1; + COPYBACK_AND_DESTROY = 2; +} + +enum SlotType { + EMBEDDING_VARIABLE = 0; + VARIABLE = 1; +} + +enum CacheStrategy { + LRU = 0; + LFU = 1; +} + +enum EmbeddingVariableType { + IMMUTABLE = 0; + MUTABLE = 1; +} + +enum ValuePtrStatus { + OK = 0; + IS_DELETED = 1; + NOT_IN_DRAM = 2; +} + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000..bb5682c5 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,252 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl : public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, bool need_record_version, + int64 filter_freq, StorageType storage_type) + : filter_freq_(filter_freq), + is_record_freq_(need_record_freq), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL) << "Filter freqeuncy threshold shouldn't bigger than 2^12."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset(new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); +#endif // GOOGLE_CUDA + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : filter_freq_(feat_desc_impl->filter_freq_), + FeatureDescriptorImpl(feat_desc_impl) { +#if GOOGLE_CUDA + if (typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl*)) { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif // GOOGLE_CUDA + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif // GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo(emb_index, embedding_dim, + default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { return (GetFlag(val) == 0); } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL) << "Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + 
for (auto val : vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues(keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + version = version << version_offset_bits_; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = version + freq; + } while ( + !__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + tmp = ~((1L << version_offset_bits_) - 1) & tmp; + tmp += freq; + __sync_bool_compare_and_swap(tmp_ptr, *tmp_ptr, tmp); + } else { + feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { return alloc_bytes_; } + + private: + uint64 GetFlag(void* val) { return (uint64)val >> flag_offset_bits_; } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h new file mode 100644 index 00000000..4098aa75 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h @@ -0,0 +1,189 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ + +#include "embedding_config.h" +#include "filter_policy.h" + +namespace tensorflow { + +template +class CounterFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), FilterPolicy(config, ev) {} + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + int64 freq = GetFreq(keys[i], value_ptr); + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs_list, num_of_keys, + not_found_cursor_list); + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* 
is_filter, + int64 count) override { + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = OkStatus(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion(admit_value_ptr, + feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); + return s; + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); + } + + int64 GetFreq(K key) override { + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + int64 import_freq = 0; + int64 import_version = -1; + if (!is_filter) { + if (freq_buff[i] >= config_.filter_freq) { + import_freq = freq_buff[i]; + } else { + import_freq = config_.filter_freq; + } + } else { + import_freq = freq_buff[i]; + } + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); + } + return OkStatus(); + } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h new file mode 100644 index 00000000..3aaaf9d0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
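The policy above is the admission side of the same mechanism: a key stays in its counter-only representation until the accumulated frequency reaches config_.filter_freq, at which point it is promoted to a real embedding slot, its frequency and version are carried over, and the stored pointer is swapped. A simplified sketch of that flow, using hypothetical value types rather than the actual FeatureDescriptor/EmbeddingVar interfaces:

#include <cstdint>
#include <unordered_map>
#include <vector>

// Each key starts as a bare counter; once its frequency reaches the filter
// threshold it is "admitted" and receives a real embedding vector.
struct Slot {
  int64_t freq = 0;
  bool admitted = false;
  std::vector<float> embedding;  // empty until admitted
};

Slot* LookupOrCreate(std::unordered_map<int64_t, Slot>& table, int64_t key,
                     int64_t count, int64_t filter_freq, int64_t value_len) {
  Slot& s = table[key];          // creates a counter-only slot if absent
  s.freq += count;
  if (!s.admitted && s.freq >= filter_freq) {
    s.admitted = true;
    s.embedding.assign(value_len, 0.0f);  // promotion: allocate the payload
  }
  return &s;
}

Read paths that find an unadmitted slot (Lookup/BatchLookup above) are served the shared default_value_no_permission instead of a private embedding, so filtered features never occupy full embedding storage.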
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ + +#include "kv_interface.h" +#include "sparsehash/dense_hash_map_lockless" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { + +template +class LocklessHashMap : public KVInterface { + public: + LocklessHashMap(FeatureDescriptor* feat_desc) : feat_desc_(feat_desc) { + hash_map_.max_load_factor(0.8); + hash_map_.set_empty_key_and_value(LocklessHashMap::EMPTY_KEY_, + nullptr); + hash_map_.set_counternum(16); + hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); + } + + ~LocklessHashMap() override { pthread_key_delete(key_); } + + Status Lookup(K key, void** value_ptr) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == LocklessHashMap::EMPTY_KEY_) { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } else { + *value_ptr = iter.second; + return OkStatus(); + } + } + + Status Contains(K key) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == LocklessHashMap::EMPTY_KEY_) { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, const_cast(value_ptr)))); + // insert fail, exist key + if ((*(iter.first)).second != value_ptr) { + return errors::AlreadyExists("already exists Key: ", key, + " in LocklessHashMap."); + } else { + return OkStatus(); + } + } + + // Other Method + int64 Size() const override { return hash_map_.size_lockless(); } + + // Remove KV + Status Remove(K key) override { + if (hash_map_.erase_lockless(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } + } + + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap(&((*(iter.first)).second), + (*(iter.first)).second, value_ptr); + } + return OkStatus(); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + for (int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], const_cast(value_ptrs[i])))); + if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap(&((*(iter.first)).second), + (*(iter.first)).second, value_ptrs[i]); + } + } + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + std::pair* hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if 
(hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + key_list->emplace_back(hash_map_dump[j].first); + value_ptr_list->emplace_back(hash_map_dump[j].second); + } + } + free(hash_map_dump); + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + std::pair* hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + int part_id = + hash_map_dump[j].first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(hash_map_dump[j].first); + value_ptr_list[part_id].emplace_back(hash_map_dump[j].second); + } + } + } + + free(hash_map_dump); + return OkStatus(); + } + + std::string DebugString() const override { + LOG(INFO) << "map info size:" << Size() + << "map info bucket_count:" << hash_map_.bucket_count() + << "map info load_factor:" << hash_map_.load_factor() + << "map info max_load_factor:" << hash_map_.max_load_factor() + << "map info min_load_factor:" << hash_map_.min_load_factor(); + return ""; + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap(&((*(iter.first)).second), + old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + // A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + + private: + typedef google::dense_hash_map_lockless LockLessHashMap; + static const int EMPTY_KEY_; + static const int DELETED_KEY_; + LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; +}; +template +const int LocklessHashMap::EMPTY_KEY_ = -111; +template +const int LocklessHashMap::DELETED_KEY_ = -222; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h new file mode 100644 index 00000000..8ae59141 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h @@ -0,0 +1,151 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
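LocklessHashMap never frees a replaced value pointer immediately; Commit/UpdateValuePtr park the old pointer on a per-thread queue, and the oldest entry is only released once the queue exceeds CAP_INVALID_VALUEPTR, giving in-flight readers time to finish. A compressed sketch of that delayed-reclamation idea, using thread_local and a plain free() in place of pthread keys and the feature descriptor:

#include <cstdlib>
#include <deque>

void Retire(void* old_ptr) {
  constexpr size_t kRetireCap = 20000;      // mirrors CAP_INVALID_VALUEPTR
  thread_local std::deque<void*> retired;   // one queue per thread
  if (retired.size() > kRetireCap) {
    std::free(retired.front());             // oldest entry is assumed quiescent
    retired.pop_front();
  }
  retired.push_back(old_ptr);
}

Note this is a heuristic rather than a proof of safety: it bounds memory growth and delays reuse, but unlike hazard pointers or epoch-based reclamation it does not guarantee that no reader still holds the freed pointer.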
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ + +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "kv_interface.h" +#include "sparsehash/dense_hash_map" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { + +template +class DenseHashMap : public KVInterface { + public: + DenseHashMap() : hash_map_(nullptr) { + hash_map_ = new dense_hash_map[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + hash_map_[i].hash_map.max_load_factor(0.8); + hash_map_[i].hash_map.set_empty_key(-1); + hash_map_[i].hash_map.set_deleted_key(-2); + } + } + + ~DenseHashMap() override { delete[] hash_map_; } + + Status Lookup(K key, void** value_ptr) override { + int64 l_id = std::abs(key) % partition_num_; + spin_rd_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + if (iter == hash_map_[l_id].hash_map.end()) { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } else { + *value_ptr = iter->second; + return OkStatus(); + } + } + + Status Contains(K key) override { + int64 l_id = std::abs(key) % partition_num_; + spin_rd_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + if (iter == hash_map_[l_id].hash_map.end()) { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + int64 l_id = std::abs(key) % partition_num_; + spin_wr_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + // insert fail, exist key + if (iter != hash_map_[l_id].hash_map.end()) { + return errors::AlreadyExists("already exists Key: ", key, + " in DenseHashMap."); + } else { + auto iter = hash_map_[l_id].hash_map.insert( + std::move(std::pair(key, const_cast(value_ptr)))); + return OkStatus(); + } + } + + // Other Method + int64 Size() const override { + int64 ret = 0; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + ret += hash_map_[i].hash_map.size(); + } + return ret; + } + + // Remove KV + Status Remove(K key) override { + int64 l_id = std::abs(key) % partition_num_; + spin_wr_lock l(hash_map_[l_id].mu); + if (hash_map_[l_id].hash_map.erase(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i < partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + key_list->push_back(it.first); + value_ptr_list->push_back(it.second); + } + } + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& 
value_ptr_list, + int partition_id, int partition_nums) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i < partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + int part_id = it.first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(it.first); + value_ptr_list[part_id].emplace_back(it.second); + } + } + } + return OkStatus(); + } + + std::string DebugString() const override { return ""; } + + private: + const int partition_num_ = 1000; + struct dense_hash_map { + mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; + google::dense_hash_map hash_map; + }; + dense_hash_map* hash_map_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h new file mode 100644 index 00000000..cd795954 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h @@ -0,0 +1,221 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
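DenseHashMap takes the opposite approach to the lockless map: keys are sharded across a fixed number of partitions, each holding its own google::dense_hash_map behind a reader-writer spin lock, so operations on different shards never contend. A minimal sketch of the sharding pattern with standard containers (std::shared_mutex and 16 shards stand in for the spin rwlock and partition_num_ = 1000):

#include <array>
#include <cstdint>
#include <cstdlib>
#include <shared_mutex>
#include <unordered_map>

class ShardedMap {
 public:
  bool Lookup(int64_t key, void** value) const {
    const Shard& s = shard(key);
    std::shared_lock<std::shared_mutex> l(s.mu);  // readers share the lock
    auto it = s.map.find(key);
    if (it == s.map.end()) return false;
    *value = it->second;
    return true;
  }

  bool Insert(int64_t key, void* value) {
    Shard& s = shard(key);
    std::unique_lock<std::shared_mutex> l(s.mu);  // writers are exclusive
    return s.map.emplace(key, value).second;      // false if the key exists
  }

 private:
  struct Shard {
    mutable std::shared_mutex mu;
    std::unordered_map<int64_t, void*> map;
  };
  static constexpr int kShards = 16;
  Shard& shard(int64_t key) { return shards_[std::llabs(key) % kShards]; }
  const Shard& shard(int64_t key) const {
    return shards_[std::llabs(key) % kShards];
  }
  std::array<Shard, kShards> shards_;
};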
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "leveldb_kv.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { +template +class DramLevelDBStore : public MultiTierStorage { + public: + DramLevelDBStore(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); + } + + ~DramLevelDBStore() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete leveldb_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = leveldb_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + leveldb_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = leveldb_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + leveldb_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + dram_->CreateAndInsert(key, value_ptr); + return OkStatus(); + } + + Status Remove(K key) override { + dram_->Remove(key); + leveldb_->Remove(key); + return OkStatus(); + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += leveldb_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return leveldb_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = leveldb_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_leveldb_key_list; + std::vector value_ptr_list, tmp_leveldb_value_list; + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + + TF_CHECK_OK( + leveldb_->GetSnapshot(&tmp_leveldb_key_list, &tmp_leveldb_value_list)); + + for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); + } + + std::vector leveldb_key_list; + for (int64 i = 0; i < tmp_leveldb_key_list.size(); i++) { + Status s = dram_->Contains(tmp_leveldb_key_list[i]); + if (!s.ok()) { + 
key_list.emplace_back(tmp_leveldb_key_list[i]); + leveldb_key_list.emplace_back(tmp_leveldb_key_list[i]); + value_ptr_list.emplace_back(tmp_leveldb_value_list[i]); + } + } + + ValueIterator* value_iter = leveldb_->GetValueIterator( + leveldb_key_list, emb_config.emb_index, value_len); + + { + mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = &hbm_feat_desc; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, value_iter))); + } + + for (auto it : tmp_leveldb_value_list) { + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); + } + delete value_iter; + + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(leveldb_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + int total_dim() override { return dram_feat_desc_->total_dim(); } + + private: + DramStorage* dram_; + LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h new file mode 100644 index 00000000..6f83ecb6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h @@ -0,0 +1,218 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
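DramLevelDBStore::Get/GetOrCreate follow a classic two-tier read path: hit DRAM, otherwise fetch from LevelDB and try to promote the value into DRAM, and if that promotion loses a race to another inserter, drop the fetched copy and re-read the winner's. The pattern, reduced to a generic sketch with an assumed Tier interface (not the actual storage classes):

#include <optional>

struct Tier {
  virtual std::optional<void*> Get(long long key) = 0;
  virtual bool TryInsert(long long key, void* value) = 0;  // false if key exists
  virtual void DestroyValue(void* value) = 0;
  virtual ~Tier() = default;
};

std::optional<void*> TieredGet(Tier& fast, Tier& slow, long long key) {
  if (auto v = fast.Get(key)) return v;     // fast-tier hit
  auto v = slow.Get(key);
  if (!v) return std::nullopt;              // miss in both tiers
  if (fast.TryInsert(key, *v)) return v;    // promoted into the fast tier
  slow.DestroyValue(*v);                    // another thread promoted first
  return fast.Get(key);                     // read the copy that won the race
}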
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "feature_descriptor.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { + +template +class DramPmemStorage : public MultiTierStorage { + public: + DramPmemStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); + } + + ~DramPmemStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete pmem_; + delete pmem_feat_desc_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); + if (s.ok()) { + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + dram_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = pmem_->Get(key, value_ptr); + + void* new_value_ptr = dram_->CreateValuePtr(); + if (s.ok()) { + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); + } + *value_ptr = new_value_ptr; + + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, key already exist + dram_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + + Status Remove(K key) override { + dram_->Remove(key); + pmem_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += pmem_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return pmem_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = pmem_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_pmem_key_list; + std::vector value_ptr_list, tmp_pmem_value_list; + + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + 
TF_CHECK_OK(pmem_->GetSnapshot(&tmp_pmem_key_list, &tmp_pmem_value_list)); + pmem_->Shrink(tmp_pmem_key_list, tmp_pmem_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_pmem_key_list.size(); i++) { + Status s = dram_->Contains(tmp_pmem_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_pmem_key_list[i]); + value_ptr_list.emplace_back(tmp_pmem_value_list[i]); + } + } + + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, pmem_feat_desc_))); + + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(pmem_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + + protected: + int total_dim() override { return pmem_feat_desc_->total_dim(); } + + private: + DramStorage* dram_; + PmemLibpmemStorage* pmem_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h new file mode 100644 index 00000000..f8cdff26 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
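DramPmemStorage::Save, like the LevelDB variant above, builds its checkpoint from the union of the DRAM snapshot and only those slow-tier keys that are not currently resident in DRAM, so the hotter DRAM copy wins when a key exists in both tiers. A sketch of that merge with plain containers (the real code asks the DRAM tier per key via Contains; a set is used here only to keep the sketch self-contained):

#include <unordered_set>
#include <utility>
#include <vector>

using Snapshot = std::pair<std::vector<long long>, std::vector<void*>>;  // keys, value ptrs

Snapshot MergeForCheckpoint(const Snapshot& dram, const Snapshot& slow_tier) {
  Snapshot merged = dram;
  std::unordered_set<long long> in_dram(dram.first.begin(), dram.first.end());
  for (size_t i = 0; i < slow_tier.first.size(); ++i) {
    if (in_dram.count(slow_tier.first[i]) == 0) {  // keys only the slow tier holds
      merged.first.push_back(slow_tier.first[i]);
      merged.second.push_back(slow_tier.second[i]);
    }
  }
  return merged;  // each key is written to the checkpoint exactly once
}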
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "ssd_hash_kv.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { +template +class DramSsdHashStorage : public MultiTierStorage { + public: + DramSsdHashStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); + } + + ~DramSsdHashStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete ssd_hash_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = ssd_hash_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, the key is already in Dram; + ssd_hash_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = ssd_hash_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, the key is already in Dram; + ssd_hash_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + dram_->CreateAndInsert(key, value_ptr); + return OkStatus(); + } + + Status Remove(K key) override { + dram_->Remove(key); + ssd_hash_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += ssd_hash_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return ssd_hash_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = ssd_hash_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + bool IsUsePersistentStorage() override { return true; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + dram_->Save(tensor_name, prefix, writer, emb_config, shrink_args, value_len, + default_value); + + ssd_hash_->Save(tensor_name, prefix, writer, emb_config, shrink_args, + value_len, default_value); + + return OkStatus(); + } + + Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, + const std::string& ssd_emb_file_name, + EmbeddingVar* ev, + RestoreSSDBuffer& restore_buff) override { + std::map file_id_map; + for (int64 i = 0; i < 
restore_buff.num_of_files; i++) { + file_id_map[restore_buff.file_list_buf[i]] = i; + } + + ssd_hash_->CopyEmbFilesFromCkpt( + restore_buff.file_list_buf, restore_buff.invalid_record_count_list_buf, + restore_buff.record_count_list_buf, restore_buff.num_of_files, + ssd_emb_file_name); + + ssd_hash_->Import(restore_buff.key_list_buf, + restore_buff.key_file_id_list_buf, + restore_buff.key_offset_list_buf, + restore_buff.num_of_keys, file_id_map); + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(ssd_hash_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + + protected: + int total_dim() override { return dram_feat_desc_->total_dim(); } + + private: + DramStorage* dram_ = nullptr; + SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000..79e029a2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,195 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
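All three Dram* storages share the same eviction path: for every key picked by the cache policy, commit the DRAM value to the slower tier, remove it from the DRAM map, then reclaim the value either immediately or through the delayed-destroy queue. Reduced to a sketch with an assumed interface:

#include <optional>
#include <vector>

struct EvictableTier {
  virtual std::optional<void*> Get(long long key) = 0;
  virtual void Commit(long long key, void* value) = 0;  // write-back
  virtual void Remove(long long key) = 0;
  virtual void DestroyValue(void* value) = 0;
  virtual ~EvictableTier() = default;
};

void Evict(EvictableTier& fast, EvictableTier& slow,
           const std::vector<long long>& victim_keys) {
  for (long long key : victim_keys) {
    auto v = fast.Get(key);
    if (!v) continue;          // already evicted or removed
    slow.Commit(key, *v);      // persist before dropping the DRAM copy
    fast.Remove(key);
    fast.DestroyValue(*v);     // eager variant; the delayed variant parks *v
  }
}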
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { return (unsigned int)embed_num; } + + inline void SetEmbeddingNum(size_t s) { embed_num = (unsigned char)s; } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { return (unsigned int)header_size; } + + inline void SetHeaderSize(size_t size) { header_size = (unsigned char)size; } +}; + +template +class DynmaicDimDescriptorImpl : public FeatureDescriptorImpl { + using FeatureDescriptorImpl::slot_infos_; + + public: + DynmaicDimDescriptorImpl(Allocator* alloc, int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + sizeof(MetaHeader) + + sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, false, false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo(emb_index, embedding_dim, + default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag = (std::atomic_flag*)(val + flag_offset_bytes_); + while (flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = + // metadata.to_ulong(); the ptr_ will be occaionally 
modified from + // 0x7f18700912a0 to 0x700912a0 must use ((V**)ptr_ + 1 + 1)[emb_index] = + // tensor_val; to avoid + // LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { return true; } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + // LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i < embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val : vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { alloc_ = alloc; } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy( + val_ptr, value, + sizeof(V) * + FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { return alloc_bytes_; } + + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h new file mode 100644 index 00000000..75506b4e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h @@ -0,0 +1,244 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
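DynmaicDimDescriptorImpl materialises a slot's embedding only on first access: a bitset in the packed MetaHeader records which slots exist, and a per-record std::atomic_flag spin lock ensures exactly one thread performs the allocation, with the bitset re-checked under the lock. A standalone sketch of that double-checked, lazily allocated layout (plain structs instead of the packed header):

#include <array>
#include <atomic>
#include <bitset>
#include <vector>

constexpr int kMaxSlots = 40;  // mirrors COLUMN_BITSET_SIZE

struct Record {
  std::bitset<kMaxSlots> initialized;
  std::atomic_flag lock = ATOMIC_FLAG_INIT;
  std::array<std::vector<float>, kMaxSlots> slots;
};

float* GetOrCreateSlot(Record& rec, int slot, int dim, const float* default_value) {
  if (!rec.initialized.test(slot)) {                             // unlocked fast path
    while (rec.lock.test_and_set(std::memory_order_acquire)) {}  // spin
    if (!rec.initialized.test(slot)) {                           // re-check under lock
      rec.slots[slot].assign(default_value, default_value + dim);
      rec.initialized.set(slot);
    }
    rec.lock.clear(std::memory_order_release);
  }
  return rec.slots[slot].data();
}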
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { +class EmbFile { + public: + EmbFile(const std::string& path, size_t ver, int64 buffer_size) + : version_(ver), + file_size_(buffer_size), + count_(0), + invalid_count_(0), + is_deleted_(false) { + std::stringstream ss; + ss << std::setw(4) << std::setfill('0') << ver << ".emb"; + filepath_ = path + ss.str(); + OpenFstream(); + } + + virtual ~EmbFile() {} + virtual void Reopen() = 0; + virtual void Read(char* val, const size_t val_len, const size_t offset) = 0; + + virtual void DeleteFile() { + is_deleted_ = true; + if (fs_.is_open()) { + fs_.close(); + } + close(fd_); + std::remove(filepath_.c_str()); + } + + void LoadExistFile(const std::string& old_file_path, size_t count, + size_t invalid_count) { + Env::Default()->CopyFile(old_file_path, filepath_); + Reopen(); + count_ = count; + invalid_count_ = invalid_count; + } + + void Flush() { + if (fs_.is_open()) { + fs_.flush(); + } + } + + void MapForRead() { + file_addr_for_read_ = + (char*)mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fd_, 0); + } + + void UnmapForRead() { munmap((void*)file_addr_for_read_, file_size_); } + + void ReadWithMemcpy(char* val, const size_t val_len, const size_t offset) { + memcpy(val, file_addr_for_read_ + offset, val_len); + } + + void Write(const char* val, const size_t val_len) { + if (fs_.is_open()) { + fs_.write(val, val_len); + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_DONTNEED); + } else { + fs_.open(filepath_, + std::ios::app | std::ios::in | std::ios::out | std::ios::binary); + fs_.write(val, val_len); + fs_.close(); + } + } + + size_t Count() const { return count_; } + + void AddCount(size_t n) { count_ += n; } + + size_t InvalidCount() const { return invalid_count_; } + + void AddInvalidCount(size_t n) { invalid_count_ += n; } + + void AddInvalidCountAtomic(size_t n) { + __sync_fetch_and_add(&invalid_count_, n); + } + + size_t Version() const { return version_; } + + bool IsDeleted() const { return is_deleted_; } + + bool IsNeedToBeCompacted() { + return (count_ >= invalid_count_) && (count_ / 3 < invalid_count_); + } + + protected: + void OpenFstream() { + fs_.open(filepath_, + std::ios::app | std::ios::in | std::ios::out | std::ios::binary); + CHECK(fs_.good()); + } + void CloseFstream() { + if (fs_.is_open()) { + fs_.close(); + } + } + + private: + size_t version_; + size_t count_; + size_t invalid_count_; + char* file_addr_for_read_; + std::fstream fs_; + + protected: + int64 file_size_; + int fd_; + bool is_deleted_; + std::string filepath_; +}; + +class MmapMadviseEmbFile : public EmbFile { + public: + MmapMadviseEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + file_addr_ = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + } + + void Reopen() override { + CloseFstream(); + munmap((void*)file_addr_, EmbFile::file_size_); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + file_addr_ = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + } + + void DeleteFile() override { + is_deleted_ = true; + 
CloseFstream(); + munmap((void*)file_addr_, EmbFile::file_size_); + close(EmbFile::fd_); + std::remove(EmbFile::filepath_.c_str()); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + memcpy(val, file_addr_ + offset, val_len); + int err = madvise(file_addr_, EmbFile::file_size_, MADV_DONTNEED); + if (err < 0) { + LOG(FATAL) << "Failed to madvise the page, file_addr_: " + << (void*)file_addr_ << ", file_size: " << EmbFile::file_size_; + } + } + + private: + char* file_addr_; +}; + +class MmapEmbFile : public EmbFile { + public: + MmapEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + } + + void Reopen() override { + CloseFstream(); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + char* file_addr_tmp = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + memcpy(val, file_addr_tmp + offset, val_len); + munmap((void*)file_addr_tmp, EmbFile::file_size_); + } +}; + +class DirectIoEmbFile : public EmbFile { + public: + DirectIoEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY | O_DIRECT); + } + + void Reopen() override { + EmbFile::CloseFstream(); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY | O_DIRECT); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + size_t page_size = getpagesize(); + int pages_to_read = val_len / page_size; + if (val_len % page_size != 0) { + pages_to_read += 1; + } + if (offset + val_len >= page_size * pages_to_read) { + pages_to_read += 1; + } + int aligned_offset = offset - (offset % page_size); + char* read_buffer = (char*)memalign(page_size, page_size * pages_to_read); + + int status = pread(EmbFile::fd_, (void*)read_buffer, + page_size * pages_to_read, aligned_offset); + if (status < 0) { + LOG(FATAL) << "Failed to pread, read size: " << page_size * pages_to_read + << ", offset: " << aligned_offset; + } + memcpy(val, read_buffer + (offset % page_size), val_len); + free(read_buffer); + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h new file mode 100644 index 00000000..a439315d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h @@ -0,0 +1,97 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
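DirectIoEmbFile::Read has to respect O_DIRECT's alignment rules: the offset is rounded down to a page boundary, whole pages are read into a page-aligned bounce buffer, and the requested bytes are then copied out. The same idea as a standalone POSIX helper (hypothetical name, with the page rounding written out explicitly):

#include <cstdlib>
#include <cstring>
#include <malloc.h>
#include <sys/types.h>
#include <unistd.h>

// Reads `len` bytes at `offset` from `fd` (opened with O_DIRECT) into `dst`.
bool AlignedPread(int fd, char* dst, size_t len, off_t offset) {
  const size_t page = static_cast<size_t>(getpagesize());
  const off_t aligned_offset = offset - (offset % page);
  const size_t span = static_cast<size_t>(offset - aligned_offset) + len;
  const size_t pages = (span + page - 1) / page;         // round up to whole pages
  char* bounce = static_cast<char*>(memalign(page, pages * page));
  if (bounce == nullptr) return false;
  const ssize_t n = pread(fd, bounce, pages * page, aligned_offset);
  const bool ok = n >= static_cast<ssize_t>(span);       // got everything we need
  if (ok) std::memcpy(dst, bounce + (offset % page), len);
  std::free(bounce);
  return ok;
}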
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ +#include +#include + +#include "emb_file.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { + +enum class IoScheme { + MMAP_AND_MADVISE = 0, + MMAP = 1, + DIRECT_IO = 2, + INVALID = 3 +}; + +class EmbFileCreator { + public: + virtual EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) = 0; +}; + +class MmapAndMadviseEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new MmapMadviseEmbFile(path, version, buffer_size); + } +}; + +class MmapEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new MmapEmbFile(path, version, buffer_size); + } +}; + +class DirectIoEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new DirectIoEmbFile(path, version, buffer_size); + } +}; + +class EmbFileCreatorFactory { + public: + static EmbFileCreator* Create(const std::string& io_scheme) { + std::map scheme_map{ + {"mmap_and_madvise", IoScheme::MMAP_AND_MADVISE}, + {"mmap", IoScheme::MMAP}, + {"directio", IoScheme::DIRECT_IO}}; + + IoScheme scheme = IoScheme::INVALID; + if (scheme_map.find(io_scheme) != scheme_map.end()) { + scheme = scheme_map[io_scheme]; + } + + switch (scheme) { + case IoScheme::MMAP_AND_MADVISE: + static MmapAndMadviseEmbFileCreator mmap_madvise_file_creator; + return &mmap_madvise_file_creator; + case IoScheme::MMAP: + static MmapEmbFileCreator mmap_file_creator; + return &mmap_file_creator; + case IoScheme::DIRECT_IO: + static DirectIoEmbFileCreator directio_file_creator; + return &directio_file_creator; + default: + LOG(WARNING) << "Invalid IO scheme of SSDHASH," + << " use default mmap_and_advise scheme."; + static MmapAndMadviseEmbFileCreator default_file_creator; + return &default_file_creator; + } + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h new file mode 100644 index 00000000..e328ef91 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h @@ -0,0 +1,110 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ + +#include + +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +struct EmbeddingConfig { + int64 emb_index; + int64 primary_emb_index; + int64 block_num; + int64 slot_num; + std::string name; + int64 steps_to_live; + int64 filter_freq; + int64 max_freq; + float l2_weight_threshold; + int64 kHashFunc; + int64 num_counter; + DataType counter_type; + int64 default_value_dim; + float default_value_no_permission; + bool record_freq; + bool record_version; + bool is_inference; + + EmbeddingConfig(int64 emb_index = 0, int64 primary_emb_index = 
0, + int64 block_num = 1, int slot_num = 0, + const std::string& name = "", int64 steps_to_live = 0, + int64 filter_freq = 0, int64 max_freq = 999999, + float l2_weight_threshold = -1.0, int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64, + int64 default_value_dim = 4096, + float default_value_no_permission = .0, + bool record_freq = false, bool record_version = false, + bool is_inference = false) + : emb_index(emb_index), + primary_emb_index(primary_emb_index), + block_num(block_num), + slot_num(slot_num), + name(name), + steps_to_live(steps_to_live), + filter_freq(filter_freq), + max_freq(max_freq), + l2_weight_threshold(l2_weight_threshold), + counter_type(counter_type), + default_value_dim(default_value_dim), + default_value_no_permission(default_value_no_permission), + record_freq(record_freq), + record_version(record_version), + is_inference(is_inference) { + if (max_element_size != 0 && false_positive_probability != -1.0) { + kHashFunc = calc_num_hash_func(false_positive_probability); + num_counter = + calc_num_counter(max_element_size, false_positive_probability); + } else { + kHashFunc = 0; + num_counter = 0; + } + } + + int64 calc_num_counter(int64 max_element_size, + float false_positive_probability) { + float loghpp = fabs(log(false_positive_probability)); + float factor = log(2) * log(2); + int64 num_bucket = ceil(loghpp / factor * max_element_size); + if (num_bucket * sizeof(counter_type) > 10 * (1L << 30)) + LOG(WARNING) << "The Size of BloomFilter is more than 10GB!"; + return num_bucket; + } + + bool is_counter_filter() { + if (filter_freq != 0 && kHashFunc == 0 && num_counter == 0) { + return true; + } else { + return false; + } + } + + int64 calc_num_hash_func(float false_positive_probability) { + float loghpp = fabs(log(false_positive_probability) / log(2)); + return ceil(loghpp); + } + bool is_primary() const { return emb_index == primary_emb_index; } + + bool is_save_freq() const { return filter_freq != 0 || record_freq; } + + bool is_save_version() const { return steps_to_live != 0 || record_version; } + + int64 get_filter_freq() { return filter_freq; } + + std::string DebugString() const { + return strings::StrCat( + "opname: ", name, " emb_index: ", emb_index, + " primary_emb_index: ", primary_emb_index, " block_num: ", block_num, + " slot_num: ", slot_num, " steps_to_live: ", steps_to_live, + " filter_freq: ", filter_freq, " max_freq: ", max_freq, + " l2_weight_threshold: ", l2_weight_threshold, + " default_value_dim: ", default_value_dim); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h new file mode 100644 index 00000000..030ea37d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h @@ -0,0 +1,89 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
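EmbeddingConfig sizes its optional Bloom-filter admission counters with the standard formulas m = ceil(n * |ln p| / (ln 2)^2) counters and k = ceil(|log2 p|) hash functions, where n is max_element_size and p is the target false-positive probability. A small standalone check of those formulas:

#include <cmath>
#include <cstdint>
#include <cstdio>

int64_t NumCounters(int64_t max_elements, double false_positive_prob) {
  return static_cast<int64_t>(
      std::ceil(std::fabs(std::log(false_positive_prob)) /
                (std::log(2.0) * std::log(2.0)) * max_elements));
}

int64_t NumHashFuncs(double false_positive_prob) {
  return static_cast<int64_t>(
      std::ceil(std::fabs(std::log(false_positive_prob) / std::log(2.0))));
}

int main() {
  // 1e6 elements at a 1% false-positive rate -> roughly 9.6M counters, 7 hashes.
  std::printf("m=%lld k=%lld\n",
              static_cast<long long>(NumCounters(1000000, 0.01)),
              static_cast<long long>(NumHashFuncs(0.01)));
  return 0;
}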
+See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ +#include + +#include "tensorflow/core/framework/allocator.h" + +namespace tensorflow { +namespace embedding { +template +class EmbeddingMemoryPool { + public: + explicit EmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) + : alloc_(alloc), value_len_(value_len), block_size_(block_size) { + embs_per_block_ = block_size_ / (sizeof(V) * value_len_); + CreateBlock(); + } + + ~EmbeddingMemoryPool() { + for (auto it : block_list_) { + alloc_->DeallocateRaw(it); + } + } + + V* Allocate() { + if (free_ptr_queue_.size() == 0) { + CreateBlock(); + } + V* ptr = free_ptr_queue_.front(); + free_ptr_queue_.pop_front(); + return ptr; + } + + void Deallocate(std::vector value_ptrs) { + int64 prev_size = value_ptrs_queue_.size(); + for (auto it : value_ptrs) { + value_ptrs_queue_.emplace_back(it); + } + if (value_ptrs_queue_.size() > embs_per_block_) { + int64 n = value_ptrs_queue_.size() - embs_per_block_; + n = std::min(prev_size, n); + for (int64 i = 0; i < n; i++) { + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); + value_ptrs_queue_.pop_front(); + } + } + } + + void Deallocate(V* ptr) { free_ptr_queue_.emplace_back(ptr); } + + private: + void CreateBlock() { + V* dev_addr = + (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(V) * value_len_ * embs_per_block_); + block_list_.emplace_back(dev_addr); + for (int64 i = 0; i < embs_per_block_; i++) { + free_ptr_queue_.emplace_back(dev_addr + i * value_len_); + } + } + + int64 block_size_; + int64 value_len_; + int64 embs_per_block_; + Allocator* alloc_; + std::deque free_ptr_queue_; + std::deque value_ptrs_queue_; + std::vector block_list_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc new file mode 100644 index 00000000..7d8f889c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc @@ -0,0 +1,77 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "embedding_var.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr) { + volatile bool is_kernel_finish = false; + event_mgr->ThenExecute(stream, + [&is_kernel_finish]() { is_kernel_finish = true; }); + while (!is_kernel_finish) { + } +} + +template +void EmbeddingVar::CopyEmbeddingsToBuffer( + V* val_base, int64 size, V** memcpy_address, se::Stream* compute_stream, + EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { + int block_dim = 128; + V** dev_value_address = (V**)GetBuffer(size); + DeviceMemoryBase gpu_dst_ptr(dev_value_address, size * sizeof(V*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, memcpy_address, size * sizeof(V*)); + + int limit = size; + int length = ValueLen(); + TF_CHECK_OK(GpuLaunchKernel(embedding::BatchCopy, + (limit + block_dim - 1) / block_dim * length, + block_dim, 0, gpu_device.stream(), + dev_value_address, val_base, length, limit)); + SyncWithEventMgr(compute_stream, event_mgr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVar::CopyEmbeddingsToBuffer( \ + vtype*, int64, vtype**, se::Stream*, EventMgr*, \ + const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h new file mode 100644 index 00000000..57495fa4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h @@ -0,0 +1,706 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ + +#include "cache.h" +#include "embedding_config.h" +#include "embedding_var_context.h" +#include "embedding_var_restore.h" +#include "filter_factory.h" +#include "gpu_hash_map_kv.h" +#include "storage.h" +#include "storage_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +#if GOOGLE_CUDA +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); +#endif // GOOGLE_CUDA + +template +class GPUHashTable; + +template +class EmbeddingVar : public ResourceBase { + public: + EmbeddingVar(const string& name, embedding::Storage* storage, + EmbeddingConfig emb_cfg, Allocator* alloc, + embedding::FeatureDescriptor* feat_desc) + : name_(name), + storage_(storage), + default_value_(nullptr), + default_value_no_permission_(nullptr), + value_len_(0), + alloc_(alloc), + default_value_alloc_(alloc), + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} + + Status Init(const Tensor& default_tensor, int64 default_value_dim) { + if (storage_ == nullptr) { + return errors::InvalidArgument( + "Invalid ht_type to construct EmbeddingVar"); + } + + storage_type_ = storage_->GetStorageType(); + filter_ = FilterFactory::CreateFilter>( + emb_config_, this, storage_, feat_desc_); + emb_config_.default_value_dim = default_value_dim; + value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; + + if (storage_->IsUseHbm()) { +#if GOOGLE_CUDA + default_value_ = TypedAllocator::Allocate( + alloc_, default_tensor.NumElements(), AllocationAttributes()); + auto default_tensor_flat = default_tensor.flat(); + dev_addr_buffer_ = nullptr; + dev_addr_buffer_size_ = 0; + cudaMemcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); +#endif // GOOGLE_CUDA + } else if (storage_->IsSingleHbm()) { +#if GOOGLE_CUDA + storage_->SetValueLen(value_len_); + default_value_ = TypedAllocator::Allocate( + alloc_, default_tensor.NumElements(), AllocationAttributes()); + auto default_tensor_flat = default_tensor.flat(); + cudaMemcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); +#endif // GOOGLE_CUDA + } else { + alloc_ = ev_allocator(); + default_value_ = TypedAllocator::Allocate(default_value_alloc_, + default_tensor.NumElements(), + AllocationAttributes()); + + auto default_tensor_flat = default_tensor.flat(); + memcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes()); + + default_value_no_permission_ = TypedAllocator::Allocate( + default_value_alloc_, value_len_, AllocationAttributes()); + for (int i = 0; i < value_len_; ++i) { + default_value_no_permission_[i] = + static_cast(emb_config_.default_value_no_permission); + } + } + bool is_all_slots_initialized = feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair(default_value_, 
emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + SetAllSlotInitialized(); + } + + return OkStatus(); + } + + void SetInitialized() { is_initialized_ = true; } + + void SetAllSlotInitialized() { is_all_slot_initialized_ = true; } + + bool IsInitialized() const { return is_initialized_; } + + bool IsAllSlotInitialized() const { return is_all_slot_initialized_; } + + Status LookupKey(K key, void** value_ptr) { + return storage_->Get(key, value_ptr); + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + bool indices_as_pointer, int64 count = 1) { + if (indices_as_pointer) { + *value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); + return OkStatus(); + } else { + Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); + return s; + } + } + + Status Insert(K key, V* value) { + void* value_ptr = nullptr; + CreateKey(key, &value_ptr, true); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); + return OkStatus(); + } + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); + TF_CHECK_OK(s); + return s; + } + + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); + } + + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); + } + + void BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) { + TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); + } + + void Eviction(K* evict_ids, int64 evict_size) { + TF_CHECK_OK(storage_->Eviction(evict_ids, evict_size)); + } + + int64 GetVersion(K key) { + void* value_ptr = nullptr; + TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetVersion(value_ptr); + } + + int64 GetFreq(K key) { return filter_->GetFreq(key); } + + Status Lookup(K key, V* val, V* default_v) { + const V* default_value_ptr = + (default_v == nullptr) ? 
default_value_ : default_v; + return filter_->Lookup(key, val, default_value_ptr, + default_value_no_permission_); + } + + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys) { + auto do_work = [this, keys, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = + default_value_ + + (std::abs(keys[i]) % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(keys[i], output + i * value_len_, default_v, + default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + // Used for CPU Adaptive Embedding + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys, + V* default_value) { + auto do_work = [this, keys, output, default_value](int64 start, + int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = default_value + i * value_len_; + void* value_ptr = nullptr; + filter_->LookupOrCreate(keys[i], output + i * value_len_, default_v, + &value_ptr, 1, default_value_no_permission_); + feat_desc_->AddFreq(value_ptr, 1); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_filter = false; + filter_->LookupOrCreateKey(keys[i], &value_ptrs[i], &is_filter, 1); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + storage_->AddToCachePrefetchList(keys_tensor); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, V* output, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + V* value = nullptr; + if (is_admit) { + value = + feat_desc_->GetEmbedding(value_ptrs[i], emb_config_.emb_index); + } else { + value = default_value_no_permission_; + } + memcpy(output + i * value_len_, value, sizeof(V) * value_len_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + storage_->AddToCache(keys_tensor); + } + +#if GOOGLE_CUDA + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys) { + if (IsSingleHbm()) { + storage_->BatchLookup(context.gpu_device, keys, output, num_of_keys, + default_value_); + } else { + filter_->BatchLookup(context, keys, output, num_of_keys, default_value_, + default_value_no_permission_); + } + } + + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, + int64 num_of_keys, bool indices_as_pointer = false) { + const K* keys = (K*)keys_tensor.data(); + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + storage_->AddToCachePrefetchList(keys_tensor); + } + + void BatchLookupOrCreateKey( 
+ const EmbeddingVarContext& context, const K* keys, + void** value_ptrs, int64 num_of_keys, + std::vector>& not_found_cursor_list) { + storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, + value_len_, not_found_cursor_list); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, V* output, + int64 num_of_keys) { + std::vector embedding_ptr(num_of_keys); + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output, &embedding_ptr]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + feat_desc_->AddFreq(value_ptrs[i], 1); + if (is_admit) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptrs[i], emb_config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission_; + } + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + auto stream = context.compute_stream; + auto event_mgr = context.event_mgr; + CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), stream, + event_mgr, context.gpu_device); + + storage_->AddToCache(keys_tensor); + } + + void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) { + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); + } + + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, void** value_ptrs, int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs](int64 start, + int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = + 1000; // very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts](int64 start, + int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = + 1000; // very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, add_freq_fn); + } + return OkStatus(); + } +#endif + +#if GOOGLE_CUDA + void CopyEmbeddingsToBuffer(V* val_base, int64 size, V** memcpy_address, + se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); +#endif // GOOGLE_CUDA + + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + Eigen::array dims({value_len_}); + return typename TTypes::Flat(val, dims); + } + + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + + int64 ValueLen() const { return value_len_; } + + int64 Size() const { return storage_->Size(); } + + int64 CacheSize() const { return storage_->CacheSize(); } + + int64 MemoryUsage() const { + return storage_->Size() * (sizeof(K) + feat_desc_->data_bytes()); + } + + int64 MinFreq() { return emb_config_.filter_freq; } + + int64 StepsToLive() const { return emb_config_.steps_to_live; } + + bool IsMultiLevel() { return storage_->IsMultiLevel(); } + + bool IsUseHbm() { return storage_->IsUseHbm(); } + + bool IsSingleHbm() { return storage_->IsSingleHbm(); } + + bool IsUsePersistentStorage() { return storage_->IsUsePersistentStorage(); } + + void InitCache(embedding::CacheStrategy cache_strategy) { + storage_->InitCache(cache_strategy); + } + + std::string DebugString() const { return emb_config_.DebugString(); } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, BundleReader* reader, + bool reset_version = false, + const Eigen::GpuDevice* device = nullptr) { + return storage_->Restore(name_string, file_name_string, partition_id, + partition_num, value_len_, is_incr, reset_version, + emb_config_, device, reader, this, filter_); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, embedding::ShrinkArgs& shrink_args) { + return storage_->Save(tensor_name, prefix, writer, emb_config_, shrink_args, + value_len_, default_value_); + } + + void GetSnapshot(std::vector* key_list, std::vector* value_list, + std::vector* version_list, + std::vector* freq_list) { + std::vector value_ptr_list; + storage_->GetSnapshot(key_list, &value_ptr_list); + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + for (int64 i = 0; i < key_list->size(); i++) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = + feat_desc_->GetEmbedding(value_ptr_list[i], emb_config_.emb_index); + value_list->emplace_back(val); + } else { + value_list->emplace_back(default_value_); + } + + if (is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); + version_list->emplace_back(dump_version); + } + + if (is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); + freq_list->emplace_back(dump_freq); + } + } + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_num) { + return storage_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, + partition_num); + } + + void ExportAndRemove(K* key_list, V* value_list, int64* version_list, + int64* freq_list, std::vector& tot_keys_list, + std::vector& tot_value_ptr_list) { + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + 
&save_unfiltered_features)); + + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + + for (int64 i = 0; i < tot_keys_list.size(); ++i) { + auto& value_ptr = tot_value_ptr_list[i]; + if ((int64)value_ptr == embedding::ValuePtrStatus::IS_DELETED) continue; + + bool is_admit = feat_desc_->IsAdmit(value_ptr); + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + + if (is_admit) { + key_list[i] = tot_keys_list[i]; + + if (!is_in_dram) { + auto tmp_value = value_list + i * value_len_; + tmp_value = (V*)embedding::ValuePtrStatus::NOT_IN_DRAM; + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc_->GetEmbedding(value_ptr, 0) == nullptr) { + memcpy(value_list + i * value_len_, default_value_, + sizeof(V) * value_len_); + } else { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + memcpy(value_list + i * value_len_, val, sizeof(V) * value_len_); + } + + if (is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr); + version_list[i] = dump_version; + } + + if (is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr); + freq_list[i] = dump_freq; + } + } else { + if (!save_unfiltered_features) continue; + // TODO(JUNQI) : currently not export filtered keys + } + + if (emb_config_.is_primary()) { + Status s; + s = storage_->Remove(tot_keys_list[i]); + if (!s.ok()) { + LOG(ERROR) << "Remove keys error: " << s.message(); + } + feat_desc_->Deallocate(value_ptr); + } + } + return; + } + + Status RestoreFromKeysAndValues(int64 key_num, int partition_id, + int partition_num, const K* key_list, + const V* value_list, + const int64* version_list, + const int64* freq_list, + const Eigen::GpuDevice* device = nullptr) { + RestoreBuffer restore_buff((char*)key_list, (char*)value_list, + (char*)version_list, (char*)freq_list); + return storage_->RestoreFeatures( + key_num, kSavedPartitionNum, partition_id, partition_num, value_len_, + false /* is_filter*/, false /* is_incr*/, emb_config_, device, filter_, + restore_buff); + } + + mutex* mu() { return &mu_; } + + embedding::Storage* storage() { return storage_; } + + embedding::FeatureDescriptor* feature_descriptor() { return feat_desc_; } + + Status Shrink(embedding::ShrinkArgs& shrink_args) { + if (emb_config_.is_primary()) { + shrink_args.value_len = value_len_; + return storage_->Shrink(shrink_args); + } else { + return OkStatus(); + } + } + + string Name() { return name_; } + + V* GetDefaultValuePtr() { return default_value_; } + + int64 GetDefaultValueDim() { return emb_config_.default_value_dim; } + + V* GetDefaultValue(int64 key) { + return default_value_ + (key % emb_config_.default_value_dim) * value_len_; + } + + embedding::BatchCache* Cache() { return storage_->Cache(); } + + int64 GetEmbeddingIndex() { return emb_config_.emb_index; } + + int64 GetEmbeddingSlotNum() { return emb_config_.slot_num; } + + Allocator* GetAllocator() { return alloc_; } + + V** GetBuffer(int64 size) { + if (dev_addr_buffer_size_ >= size) { + return dev_addr_buffer_; + } else { + if (dev_addr_buffer_size_ != 0) { + alloc_->DeallocateRaw(dev_addr_buffer_); + } + dev_addr_buffer_ = (V**)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + dev_addr_buffer_size_ = size; + return dev_addr_buffer_; + } + } + + void UpdateCache(const Tensor& indices, const Tensor& indices_counts, + bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + 
storage_->UpdateCache(indices, indices_counts); + } + } + + void UpdateCache(const Tensor& indices, bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + storage_->UpdateCache(indices); + } + } + + void UpdateCache(const K* key_buff, int64 key_num, const int64* version_buff, + const int64* freq_buff) { + auto cache = Cache(); + if (cache) { + cache->update(key_buff, key_num, version_buff, freq_buff); + auto cache_size = CacheSize(); + if (cache->size() > cache_size) { + int64 evict_size = cache->size() - cache_size; + K* evict_ids = new K[evict_size]; + size_t true_size = cache->get_evic_ids(evict_ids, evict_size); + if (!IsUseHbm()) { + Eviction(evict_ids, true_size); + } + delete[] evict_ids; + } + } + } + + void LookupOrCreate(const K* key, V* val, V* default_v, int32 default_v_num, + size_t n, const Eigen::GpuDevice& device) { + storage_->BatchLookupOrCreate(key, val, default_v, default_v_num, n, + device); + } + + void LookupOrCreateKey(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device, + int64 update_version = -1) { + storage_->BatchLookupOrCreateKeys(key, item_idxs, n, device); + } + + void Lookup(const K* key, V* val, V* default_v, int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) { + storage_->BatchLookup(key, val, default_v, default_v_num, n, device); + } + + int32 SlotNum() { + return (emb_config_.block_num * (1 + emb_config_.slot_num)); + } + + int32 EmbIdx() { return emb_config_.emb_index; } + + GPUHashTable* HashTable() { return storage_->HashTable(); } + FilterPolicy>* GetFilter() const { return filter_; } + + protected: + ~EmbeddingVar() override { + // When dynamic dimension embedding is used, + // there will be more than one primary slot + if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { + delete storage_; + delete feat_desc_; + } + if (embedding::StorageType::HBM_DRAM == storage_type_) { + alloc_->DeallocateRaw(dev_addr_buffer_); + } + TypedAllocator::Deallocate(default_value_alloc_, default_value_, + value_len_ * emb_config_.default_value_dim); + if (default_value_no_permission_) { + TypedAllocator::Deallocate(default_value_alloc_, + default_value_no_permission_, value_len_); + } + if (filter_) { + delete filter_; + } + } + + private: + void LookupThroughFilter(const EmbeddingVarContext& context, + const Tensor& indices, V* output, + int64 num_of_keys) { + const K* keys = (K*)indices.data(); + auto do_work = [this, keys, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(keys[i], output + i * value_len_, default_v, + default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + std::string name_; + bool is_initialized_ = false; + bool is_all_slot_initialized_ = false; + + mutex mu_; + + V* default_value_; + V* default_value_no_permission_; + V** dev_addr_buffer_; + int64 dev_addr_buffer_size_; + int64 value_len_; + Allocator* alloc_; + Allocator* default_value_alloc_; + embedding::Storage* storage_; + embedding::StorageType storage_type_; + EmbeddingConfig emb_config_; + FilterPolicy>* filter_; + embedding::FeatureDescriptor* feat_desc_; + + TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); +}; + +} // namespace tensorflow + +#endif // 
TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc new file mode 100644 index 00000000..7eabf919 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc @@ -0,0 +1,229 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "embedding_var_ckpt_data.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "embedding_var_dump_iterator.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +void EmbeddingVarCkptData::Emplace( + K key, void* value_ptr, const EmbeddingConfig& emb_config, V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, + bool save_unfiltered_features) { + if ((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; + + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); + + if (is_admit) { + key_vec_.emplace_back(key); + + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else { + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); + value_ptr_vec_.emplace_back(val); + } + if (is_save_version) { + int64 dump_version = feat_desc->GetVersion(value_ptr); + version_vec_.emplace_back(dump_version); + } + + if (is_save_freq) { + int64 dump_freq = feat_desc->GetFreq(value_ptr); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) return; + + key_filter_vec_.emplace_back(key); + + if (is_save_version) { + int64 dump_version = feat_desc->GetVersion(value_ptr); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = feat_desc->GetFreq(value_ptr); + freq_filter_vec_.emplace_back(dump_freq); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, void*, const EmbeddingConfig&, vtype*, FeatureDescriptor*, \ + bool, bool, bool); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace(ktype, vtype*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) 
+TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); + j++) { + version_filter_vec_.emplace_back( + ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::SetWithPartition( \ + std::vector>&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status EmbeddingVarCkptData::ExportToCkpt(const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), bytes_limit, + &key_dump_iter, TensorShape({key_vec_.size()})); + if (!s.ok()) return s; + + EV2dVectorDataDumpIterator value_dump_iter(value_ptr_vec_, value_len, + value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), bytes_limit, + &value_dump_iter, TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), bytes_limit, + &version_dump_iter, TensorShape({version_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer(tensor_name + "-freqs", writer, + dump_buffer.get(), bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, dump_buffer.get(), bytes_limit, + &filtered_key_dump_iter, TensorShape({key_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator 
filtered_version_dump_iter( + version_filter_vec_); + s = SaveTensorWithFixedBuffer(tensor_name + "-versions_filtered", writer, + dump_buffer.get(), bytes_limit, + &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", writer, dump_buffer.get(), bytes_limit, + &filtered_freq_dump_iter, TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", writer, dump_buffer.get(), bytes_limit, + &part_offset_dump_iter, TensorShape({part_offset_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator part_filter_offset_dump_iter( + part_filter_offset_); + s = SaveTensorWithFixedBuffer(tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), bytes_limit, + &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) return s; + + return OkStatus(); +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template Status EmbeddingVarCkptData::ExportToCkpt( \ + const string&, BundleWriter*, int64, ValueIterator*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h new file mode 100644 index 00000000..0ea4f1e3 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#include "embedding_config.h" +#include "embedding_var_dump_iterator.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { +class BundleWriter; +namespace { +const int kDramFlagOffset = 49; +} + +namespace embedding { +template +class EmbeddingVarCkptData { + public: + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, + V* default_value, FeatureDescriptor* feat_desc, + bool is_save_freq, bool is_save_version, + bool save_unfiltered_features); + + void Emplace(K key, V* value_ptr); + + void SetWithPartition( + std::vector>& ev_ckpt_data_parts); + + Status ExportToCkpt(const string& tensor_name, BundleWriter* writer, + int64 value_len, ValueIterator* value_iter = nullptr); + + private: + std::vector key_vec_; + std::vector value_ptr_vec_; + std::vector version_vec_; + std::vector freq_vec_; + std::vector key_filter_vec_; + std::vector version_filter_vec_; + std::vector freq_filter_vec_; + std::vector part_offset_; + std::vector part_filter_offset_; + const int kSavedPartitionNum = 1000; +}; +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h new file mode 100644 index 00000000..192298a7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h @@ -0,0 +1,64 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template +struct EmbeddingVarContext; + +template <> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads; +}; + +#if GOOGLE_CUDA +template <> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()), + compute_stream(op_ctx->op_device_context()->stream()), + event_mgr( + op_ctx->device()->tensorflow_accelerator_device_info()->event_mgr), + gpu_allocator(op_ctx->device()->GetAllocator(AllocatorAttributes())), + gpu_device(op_ctx->eigen_gpu_device()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads = nullptr; + se::Stream* compute_stream = nullptr; + EventMgr* event_mgr = nullptr; + Allocator* gpu_allocator = nullptr; + const GPUDevice& gpu_device; +}; +#endif // GOOGLE_CUDA +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h new file mode 100644 index 00000000..ff52465e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h @@ -0,0 +1,91 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#include "embedding_config.h" +#include "kv_interface.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { +template +class DumpIterator; + +namespace embedding { +template +class EVVectorDataDumpIterator : public DumpIterator { + public: + EVVectorDataDumpIterator(const std::vector& item_list) + : curr_iter_(item_list.begin()), end_iter_(item_list.end()) {} + + bool HasNext() const { return curr_iter_ != end_iter_; } + + T Next() { + T val = *curr_iter_; + curr_iter_++; + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; +}; + +template +class EV2dVectorDataDumpIterator : public DumpIterator { + public: + EV2dVectorDataDumpIterator(std::vector& valueptr_list, int64 value_len, + ValueIterator* val_iter) + : curr_iter_(valueptr_list.begin()), + end_iter_(valueptr_list.end()), + val_iter_(val_iter), + value_len_(value_len), + col_idx_(0) { + if (!valueptr_list.empty()) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + + bool HasNext() const { return curr_iter_ != end_iter_; } + + T Next() { + T val = curr_ptr_[col_idx_++]; + if (col_idx_ >= value_len_) { + curr_iter_++; + col_idx_ = 0; + if (curr_iter_ != end_iter_) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; + ValueIterator* val_iter_; + int64 value_len_; + int64 col_idx_; + T* curr_ptr_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc new file mode 100644 index 00000000..5f7eb9d1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc @@ -0,0 +1,646 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "embedding_var_restore.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} +#define REGISTER_KERNELS(ktype) \ + template int64 ReadRecord(BundleReader*, const string&, ktype**); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = + restore_args_.m_file_name_string + "-" + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), ev_->GetEmbeddingSlotNum(), + ev_->ValueLen(), ssd_emb_file_name, ev_, ssd_buffer); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreSSD(); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreInternal(const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape; + DataType part_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.message(); + } + + if (restore_args_.m_has_filter) { + TensorShape part_filter_offset_shape; + DataType part_filter_offset_type; + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.message(); + } + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.message(); + } + } + } + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << st.message(); + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, new_dim, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form checkpoint... 
:" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + auto part_offset_flat = part_offset_tensor.flat(); + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + Status s = EVRestoreFilteredFeatures(subpart_id, new_dim, restore_buff, + part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.message(); + } + } + } + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreInternal( \ + const std::string&, const EmbeddingConfig&, const Eigen::GpuDevice*, \ + RestoreBuffer&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +bool CheckpointLoader::IsOldCheckpoint( + const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + 
&part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template bool CheckpointLoader::IsOldCheckpoint( \ + const std::string&, const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::InitPartNumAndLoadedParts( + std::vector& tensor_name_vec) { + std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::InitPartNumAndLoadedParts( \ + std::vector&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVInitTensorNameAndShape( + const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; 
+ restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader( + restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVInitTensorNameAndShape( \ + const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef 
REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFeatures( + int tot_key_num, int64 key_part_offset, int64 value_part_offset, + int64 version_part_offset, int64 freq_part_offset, + RestoreBuffer& restore_buff, int64 new_dim, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset(restore_args_.m_tensor_value, + value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, + restore_buff.value_buffer, value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset(restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), + restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset(restore_args_.m_tensor_freq, + freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), + restore_buff.freq_buffer, freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, + restore_args_.m_is_incr, emb_config, device, filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFeatures( \ + int, int64, int64, int64, int64, RestoreBuffer&, int64, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset(restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), + restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, + restore_args_.m_is_incr, emb_config, device, filter_, restore_buff); + if (!st.ok()) return st; + 
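+      // Only advance by the number of filtered keys actually read in this pass.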
tot_key_filter_num -= read_key_num; + } + } + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFilteredFeatures( \ + int64, int64, RestoreBuffer&, typename TTypes::Flat, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h new file mode 100644 index 00000000..4235d8fb --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h @@ -0,0 +1,223 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ + +#include "embedding_config.h" +#include "embedding_var.h" +#include "filter_policy.h" +#include "storage.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class EmbeddingVar; + +namespace { +const size_t kBufferSize = 8 << 20; +constexpr char kPartStr[] = "part_"; + +constexpr char kPartOffsetTensorSuffsix[] = "-partition_offset"; +constexpr char kPartFilterOffsetTensorSuffsix[] = "-partition_filter_offset"; +constexpr char kKeySuffix[] = "-keys"; +constexpr char kValueSuffix[] = "-values"; +constexpr char kVersionSuffix[] = "-versions"; +constexpr char kFreqSuffix[] = "-freqs"; + +constexpr char kIncrPartOffsetTensorSuffsix[] = "-incr_partition_offset"; +constexpr char kIncrKeySuffix[] = "-sparse_incr_keys"; +constexpr char kIncrValueSuffix[] = "-sparse_incr_values"; +constexpr char kIncrVersionSuffix[] = "-sparse_incr_versions"; +constexpr char kIncrFreqSuffix[] = "-sparse_incr_freqs"; +} // namespace + +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer); + +template +struct RestoreSSDBuffer { + int64* file_list_buf = nullptr; + int64* invalid_record_count_list_buf = nullptr; + int64* record_count_list_buf = nullptr; + K* key_list_buf = nullptr; + int64* key_file_id_list_buf = nullptr; + int64* key_offset_list_buf = nullptr; + int64 num_of_keys = 0; + int64 num_of_files = 0; + + explicit RestoreSSDBuffer(BundleReader* ssd_record_reader) { + num_of_files = ReadRecord(ssd_record_reader, "files", &file_list_buf); + + ReadRecord(ssd_record_reader, "invalid_record_count", + &invalid_record_count_list_buf); + ReadRecord(ssd_record_reader, "record_count", &record_count_list_buf); + num_of_keys = ReadRecord(ssd_record_reader, "keys", &key_list_buf); + + ReadRecord(ssd_record_reader, "keys_file_id", &key_file_id_list_buf); + 
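+    // Note: keys_file_id / keys_offset appear to locate each key's record within
+    // the SSD files; all buffers read here are released in the destructor below.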
ReadRecord(ssd_record_reader, "keys_offset", &key_offset_list_buf); + } + + ~RestoreSSDBuffer() { + delete[] file_list_buf; + delete[] invalid_record_count_list_buf; + delete[] record_count_list_buf; + delete[] key_list_buf; + delete[] key_file_id_list_buf; + delete[] key_offset_list_buf; + } +}; + +struct RestoreArgs { + std::string m_file_name_string; + std::string m_name_string; + std::string m_tensor_key; + std::string m_tensor_value; + std::string m_tensor_version; + std::string m_tensor_freq; + std::vector m_loaded_parts; + int64 m_partition_id; + int64 m_partition_num; + int64 m_idx; + int m_old_dim; + bool m_is_incr; + bool m_reset_version; + bool m_has_freq; + bool m_has_filter; + bool m_is_oldform; + RestoreArgs(const std::string name_string, const std::string file_name_string, + int64 partition_id, int64 partition_num, bool is_incr, + bool reset_version) + : m_name_string(name_string), + m_file_name_string(file_name_string), + m_partition_id(partition_id), + m_partition_num(partition_num), + m_idx(0), + m_old_dim(0), + m_is_incr(is_incr), + m_reset_version(reset_version), + m_has_freq(true), + m_has_filter(true), + m_is_oldform(false) {} + RestoreArgs() = default; +}; + +template +class CheckpointLoader { + public: + CheckpointLoader(embedding::Storage* storage, EmbeddingVar* ev, + FilterPolicy>* filter, + const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, bool reset_version, + BundleReader* reader) + : storage_(storage), ev_(ev), filter_(filter), reader_(reader) { + restore_args_ = RestoreArgs(name_string, file_name_string, partition_id, + partition_num, is_incr, reset_version); + } + + void RestoreCkpt(const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + /* Step 1: Restore SSD ckpt Data (Optional) + Step 2; Restore model ckpt */ + RestoreSSD(); + + std::vector tensor_name_vec; + InitPartNumAndLoadedParts(tensor_name_vec); + + RestoreBuffer restore_buff(kBufferSize); + for (auto& tensor_name : tensor_name_vec) { + RestoreInternal(tensor_name, emb_config, device, restore_buff); + } + } + + void RestoreInternal(const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff); + + private: + void RestoreSSD(); + + bool IsOldCheckpoint(const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix); + + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec); + + Status EVInitTensorNameAndShape(const std::string& tensor_name); + + Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device); + + Status EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device); + + Status RestoreCustomDim(int new_dim, int read_key_num, + size_t value_unit_bytes, size_t value_bytes_read, + size_t value_unit_bytes_new, + RestoreBuffer& restore_buff) { + bool restore_customDim; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_RESTORE_CUSTOM_DIM", false, + &restore_customDim)); + if (restore_customDim && ev_->IsUseHbm()) { + return errors::FailedPrecondition( + "HBM EV not and custom dim," + "are not supported used together"); + } + if (restore_customDim && restore_args_.m_old_dim 
!= new_dim) { + VLOG(2) << "restore, read_value_reshape dim: from " + << restore_args_.m_old_dim << " to " << new_dim; + if (read_key_num * value_unit_bytes != value_bytes_read) { + return tensorflow::errors::FailedPrecondition( + "Expected read_key_num * value_unit_bytes == " + "value_bytes_read, but got read_key_num * value_unit_bytes " + "!= value_bytes_read!"); + } + + std::unique_ptr tmp_ptr(new char[kBufferSize]); + size_t read_once = std::min(value_unit_bytes, value_unit_bytes_new); + for (int i = 0; i < read_key_num; ++i) { + memcpy(tmp_ptr.get() + i * value_unit_bytes_new, + restore_buff.value_buffer + i * value_unit_bytes, read_once); + if (restore_args_.m_old_dim >= new_dim) continue; + auto p = ev_->GetDefaultValue(restore_args_.m_idx++); + memcpy(tmp_ptr.get() + i * value_unit_bytes_new + value_unit_bytes, + p + value_unit_bytes, value_unit_bytes_new - value_unit_bytes); + } + auto tmp = tmp_ptr.release(); + tmp_ptr.reset(restore_buff.value_buffer); + restore_buff.value_buffer = tmp; + } + return OkStatus(); + } + + private: + embedding::Storage* storage_; + EmbeddingVar* ev_; + FilterPolicy>* filter_; + BundleReader* reader_; + RestoreArgs restore_args_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h b/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h new file mode 100644 index 00000000..766362da --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h @@ -0,0 +1,139 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { +template +class MultiTierStorage; + +template +struct StorageItem { + volatile bool is_occupied; + volatile bool is_deleted; + + StorageItem(bool is_occupied, volatile bool is_deleted) + : is_occupied(is_occupied), is_deleted(is_deleted) {} +}; + +template +class EvictionManager { + public: + EvictionManager() { + num_of_threads_ = 1; + TF_CHECK_OK(ReadInt64FromEnvVar("TF_MULTI_TIER_EV_EVICTION_THREADS", 1, + &num_of_threads_)); + thread_pool_.reset(new thread::ThreadPool(Env::Default(), ThreadOptions(), + "EVICTION_MANAGER", 3, + /*low_latency_hint=*/false)); + } + + ~EvictionManager() {} + + TF_DISALLOW_COPY_AND_ASSIGN(EvictionManager); + + void Schedule(std::function fn) { + thread_pool_->Schedule(std::move(fn)); + } + + void AddStorage(MultiTierStorage* storage) { + mutex_lock l(mu_); + auto ret = storage_table_.emplace( + std::make_pair(storage, new StorageItem(false, false))); + if (ret.second && num_of_active_threads_ < num_of_threads_) StartThread(); + } + + void DeleteStorage(MultiTierStorage* storage) { + auto storage_item = storage_table_[storage]; + bool delete_flag = false; + while (!delete_flag) { + volatile bool* occupy_flag = &storage_item->is_occupied; + delete_flag = __sync_bool_compare_and_swap(occupy_flag, false, true); + if (delete_flag) { + storage_item->is_deleted = true; + } + *occupy_flag = false; + } + } + + private: + void StartThread() { + while (this->flag_.test_and_set(std::memory_order_acquire)); + if (num_of_active_threads_ < num_of_threads_) { + __sync_fetch_and_add(&num_of_active_threads_, 1); + thread_pool_->Schedule([this]() { EvictionLoop(); }); + } + this->flag_.clear(std::memory_order_release); + } + + bool CheckStorages() { + mutex_lock l(mu_); + for (auto it = storage_table_.begin(); it != storage_table_.end();) { + if (!(it->second)->is_deleted) + return true; + else + it = storage_table_.erase(it); + } + return false; + } + + void EvictionLoop() { + while (CheckStorages()) { + mutex_lock l(mu_); + for (auto it : storage_table_) { + auto storage = it.first; + auto storage_item = it.second; + volatile bool* occupy_flag = &storage_item->is_occupied; + if (__sync_bool_compare_and_swap(occupy_flag, false, true)) { + if (storage_item->is_deleted) { + *occupy_flag = false; + continue; + } + storage->BatchEviction(); + *occupy_flag = false; + } + Env::Default()->SleepForMicroseconds(1); + } + } + __sync_fetch_and_sub(&num_of_active_threads_, 1); + } + + int64 num_of_threads_; + int64 num_of_active_threads_; + std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::map*, StorageItem*> storage_table_; + std::unique_ptr thread_pool_; + mutex mu_; +}; + +class EvictionManagerCreator { + public: + template + static EvictionManager* Create() { + static EvictionManager eviction_manager; + return &eviction_manager; + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h new file mode 100644 index 00000000..05787c6a --- /dev/null +++ 
b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h @@ -0,0 +1,154 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include + +#include "counter_filter_descriptor_impl.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "dynamic_dim_feature_descriptor_impl.h" +#include "feature_descriptor_impl.h" +#include "hbm_multi_tier_feature_descriptor.h" +#include "normal_feature_descriptor.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor(int64 block_num, int64 slot_num, Allocator* alloc, + StorageType storage_type, bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl(alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset(new CounterFilterDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version, + filter_info.second, storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + feat_desc_impl_.reset(new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset(new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo(emb_index, embedding_dim, + default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void* val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { return feat_desc_impl_->Allocate(); } + + void* Allocate(int64 freq) { return feat_desc_impl_->Allocate(freq); } + + void Deallocate(void* val) { 
feat_desc_impl_->Deallocate(val); } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>( + feat_desc_impl_.get()) + ->SetDefaultValues(keys, init_cursor, value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { feat_desc_impl_->SetAllocator(alloc); } + + int data_bytes() { return feat_desc_impl_->data_bytes(); } + + int64 GetFreq(void* val) { return feat_desc_impl_->GetFreq(val); } + + int64 GetVersion(void* val) { return feat_desc_impl_->GetVersion(val); } + + void SetFreq(void* val, int64 freq) { feat_desc_impl_->SetFreq(val, freq); } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { feat_desc_impl_->AddFreq(val, freq); } + + int total_dim() { return feat_desc_impl_->total_dim(); } + + bool IsAdmit(void* val) { return feat_desc_impl_->IsAdmit(val); } + + void* Admit(void* val) { return feat_desc_impl_->Admit(val); } + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000..18dc6696 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h @@ -0,0 +1,299 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor : public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor : public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL) << "Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { return new NonFreqDescriptor(); } +}; + +class BaseVersionDescriptor { + public: + virtual int64 GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor : public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor : public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL) << "Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { return new NonVersionDescriptor(); } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + 
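+  // Copy constructor: shares the slot layout and clones the freq/version descriptors.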
FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset(feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset(feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + LOG(FATAL) << "InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) { return Allocate(); } + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) { return true; } + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { return freq_desc_->GetFreq(val); } + + virtual int64 GetVersion(void* val) { return version_desc_->GetVersion(val); } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + // Avoid parallel consitency issue + __sync_bool_compare_and_swap(&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + // Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for (auto slot_info : slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + 
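+    // Each active descriptor appends an int64 field to the record layout and
+    // advances alloc_bytes; the Non* descriptors leave alloc_bytes unchanged.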
freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (std::abs(key) % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() - 1; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE = -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h new file mode 100644 index 00000000..db2e1a88 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h @@ -0,0 +1,51 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ + +#include "bloom_filter_policy.h" +#include "counter_filter_policy.h" +#include "embedding_config.h" +#include "filter_policy.h" +#include "nullable_filter_policy.h" + +namespace tensorflow { +namespace embedding { +template +class Storage; +} + +class FilterFactory { + public: + template + static FilterPolicy* CreateFilter( + const EmbeddingConfig& config, EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { + if (config.filter_freq > 0) { + if (config.kHashFunc != 0) { + return new BloomFilterPolicy(config, ev, feat_desc); + } else { + return new CounterFilterPolicy(config, ev, feat_desc); + } + } else { + return new NullableFilterPolicy(config, ev, storage, feat_desc); + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h new file mode 100644 index 00000000..090f6f02 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h @@ -0,0 +1,106 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ + +#include "emb_file.h" +#include "embedding_config.h" +#include "feature_descriptor.h" + +namespace tensorflow { + +struct RestoreBuffer { + char* key_buffer = nullptr; + char* value_buffer = nullptr; + char* version_buffer = nullptr; + char* freq_buffer = nullptr; + bool should_release = false; + + explicit RestoreBuffer(size_t buffer_size) { + key_buffer = new char[buffer_size]; + value_buffer = new char[buffer_size]; + version_buffer = new char[buffer_size]; + freq_buffer = new char[buffer_size]; + should_release = true; + } + + explicit RestoreBuffer(char* i_key_buffer, char* i_value_buffer, + char* i_version_buffer, char* i_freq_buffer) { + key_buffer = i_key_buffer; + value_buffer = i_value_buffer; + version_buffer = i_version_buffer; + freq_buffer = i_freq_buffer; + } + + ~RestoreBuffer() { + if (should_release) { + delete[] key_buffer; + delete[] value_buffer; + delete[] version_buffer; + delete[] freq_buffer; + } + } +}; + +template +class RestoreSSDBuffer; + +template +class FilterPolicy { + public: + FilterPolicy(const EmbeddingConfig& config, EV* ev) + : config_(config), ev_(ev) {} + + virtual void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) = 0; + + virtual Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) = 0; + +#if GOOGLE_CUDA + virtual void BatchLookup(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys, + V* default_value_ptr, + V* default_value_no_permission) = 0; + + virtual void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) = 0; +#endif // GOOGLE_CUDA + + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, + int64 count) = 0; + + virtual Status LookupKey(K key, void** val, bool* is_filter, int64 count) {} + + virtual int64 GetFreq(K key, void* value_ptr) = 0; + virtual int64 GetFreq(K key) = 0; + + virtual bool is_admit(K key, void* value_ptr) = 0; + + virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) = 0; + + protected: + EmbeddingConfig config_; + EV* ev_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h new file mode 100644 index 00000000..ed7a7be9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h @@ -0,0 +1,62 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ + +#include "shrink_policy.h" + +namespace tensorflow { +namespace embedding { +template +class GlobalStepShrinkPolicy : public ShrinkPolicy { + public: + GlobalStepShrinkPolicy(int64 steps_to_live, FeatureDescriptor* feat_desc, + KVInterface* kv) + : steps_to_live_(steps_to_live), kv_(kv), ShrinkPolicy(feat_desc) {} + + TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override { + ShrinkPolicy::ReleaseValuePtrs(); + FilterToDelete(shrink_args.global_step, key_list, value_list); + } + + private: + void FilterToDelete(int64 global_step, std::vector& key_list, + std::vector& value_list) { + for (int64 i = 0; i < key_list.size(); ++i) { + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); + if (version == -1) { + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], + global_step); + } else { + if (global_step - version > steps_to_live_) { + kv_->Remove(key_list[i]); + ShrinkPolicy::EmplacePointer(value_list[i]); + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; + } + } + } + } + + private: + int64 steps_to_live_; + KVInterface* kv_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h new file mode 100644 index 00000000..5d7ba3d0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h @@ -0,0 +1,333 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ + +#if GOOGLE_CUDA + +#include "gpu_hash_table.h" +#include "kv_interface.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { + +template +class GPUHashMapKV : public KVInterface { + public: + GPUHashMapKV(const EmbeddingConfig& config, Allocator* alloc) + : config_(config), alloc_(alloc), static_hash_table_(nullptr) { + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + if (!is_inference_) { + hash_table_ = new GPUHashTable(-1, alloc); + } + } + + ~GPUHashMapKV() override { + if (is_inference_) { + TypedAllocator::Deallocate( + alloc_, static_hash_table_->values_d, + static_hash_table_->capacity_ * static_hash_table_->dimension_); + delete static_hash_table_; + } else { + for (int i = 0; i < hash_table_->bank_ptrs.size(); ++i) { + TypedAllocator::Deallocate(alloc_, hash_table_->bank_ptrs[i], + value_len_ * hash_table_->initial_bank_size); + TypedAllocator::Deallocate(alloc_, hash_table_->existence_flag_ptrs[i], + hash_table_->initial_bank_size); + } + if (hash_table_->mem_bank_num != 0) { + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); + TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, + num_elements); + TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, + num_elements); + } + delete hash_table_; + } + } + + TF_DISALLOW_COPY_AND_ASSIGN(GPUHashMapKV); + + void SetValueLen(int64 value_len) { value_len_ = value_len; } + + Status BatchLookupOrCreateKeys(const K* keys, size_t n, int32* item_idxs, + const Eigen::GpuDevice& device) { + if (n > 0) { + mutex_lock lock(lock_); + int remaining_size = + n + *(hash_table_->start_idx) - + hash_table_->mem_bank_num * hash_table_->initial_bank_size; + if (remaining_size > 0) { + Resize(remaining_size); + } + functor::KvLookupInsertKey()( + keys, item_idxs, n, hash_table_, hash_table_->start_idx, + device.stream()); + } + return OkStatus(); + } + + Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) { + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + BatchLookupOrCreateKeys(keys, n, item_idxs, device); + functor::KvLookupCreateEmb()( + keys, val, default_v, value_len_, item_idxs, n, config_.emb_index, + default_v_num, hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (config_.block_num * (1 + config_.slot_num)), + hash_table_->initial_bank_size, device.stream()); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + } + + return OkStatus(); + } + + void GetSnapshot(std::vector* key_list, std::vector* value_list, + const EmbeddingConfig& emb_config) { + if (is_inference_) return; // Special case for testing in training mode; + auto size = hash_table_->Size(); + if (size <= 0) return; + + int32* item_idxs = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + K* keys_gpu = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + V* values_gpu = TypedAllocator::Allocate(alloc_, size * value_len_, + AllocationAttributes()); + V* values = TypedAllocator::Allocate(cpu_allocator(), size * value_len_, + AllocationAttributes()); + key_list->resize(size); + for (int64 i = 0; i < size; i++) { + value_list->emplace_back(values + i * value_len_); + } + + 
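+    // Launch the snapshot kernels on the GPU, then copy the keys and embedding
+    // values back into the host-side buffers before releasing the scratch memory.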
auto slot_num = emb_config.block_num * (1 + emb_config.slot_num); + functor::KvKeyGetSnapshot()( + keys_gpu, item_idxs, emb_config.emb_index, emb_config.primary_emb_index, + hash_table_->d_existence_flag_ptrs, hash_table_->mem_bank_num, slot_num, + hash_table_->initial_bank_size, hash_table_, size, NULL); + + functor::KvEmbGetSnapshot()( + keys_gpu, values_gpu, -1, value_len_, item_idxs, size, + emb_config.emb_index, hash_table_->d_bank_ptrs, + hash_table_->mem_bank_num, slot_num, hash_table_->initial_bank_size, + NULL); + + cudaMemcpyAsync(const_cast(key_list->data()), keys_gpu, + size * sizeof(K), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(values, values_gpu, size * value_len_ * sizeof(V), + cudaMemcpyDeviceToHost); + EventSynchronize(NULL); + TypedAllocator::Deallocate(alloc_, item_idxs, size); + TypedAllocator::Deallocate(alloc_, keys_gpu, size); + TypedAllocator::Deallocate(alloc_, values_gpu, size * value_len_); + } + + Status Import(const std::vector& key_import, + const std::vector& value_import, + const Eigen::GpuDevice* device, + const EmbeddingConfig& emb_config) { + int n = key_import.size(); + auto stream = device->stream(); + + if (is_inference_) { + if (n == 0) { + LOG(INFO) << "Size of keys in EmbeddingVar: " << emb_config.name + << " is 0 while loading in inference mode!"; + return OkStatus(); + } + static_hash_table_ = + new GPUStaticHashTable(n, value_len_, -1, -1, alloc_, stream); + K* keys_d = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(keys_d, key_import.data(), n * sizeof(K), + cudaMemcpyHostToDevice, stream); + static_hash_table_->values_d = TypedAllocator::Allocate( + alloc_, value_import.size(), AllocationAttributes()); + cudaMemcpyAsync(static_hash_table_->values_d, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + functor::KvInitStaticMap()( + keys_d, static_hash_table_, n, value_len_, stream); + EventSynchronize(stream); + + TypedAllocator::Deallocate(alloc_, keys_d, n); + } else { + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + K* key_gpu = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(key_gpu, key_import.data(), + key_import.size() * sizeof(K), cudaMemcpyHostToDevice, + stream); + BatchLookupOrCreateKeys(key_gpu, n, item_idxs, *device); + V* value_gpu = TypedAllocator::Allocate(alloc_, value_import.size(), + AllocationAttributes()); + cudaMemcpyAsync(value_gpu, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + + functor::KvUpdateEmb()( + key_import.data(), value_gpu, value_len_, item_idxs, n, + emb_config.emb_index, key_import.size(), hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (emb_config.block_num * (1 + emb_config.slot_num)), + hash_table_->initial_bank_size, stream); + EventSynchronize(stream); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + TypedAllocator::Deallocate(alloc_, value_gpu, value_import.size()); + TypedAllocator::Deallocate(alloc_, key_gpu, n); + } + } + + return OkStatus(); + } + + Status BatchLookupOrCreate(const K* keys, size_t n, + void** value_ptrs) override { + return OkStatus(); + } + + Status Lookup(K key, void** value_ptr) override { return OkStatus(); } + + Status Contains(K key) override { return OkStatus(); } + + Status Insert(K key, const void* value_ptr) override { return OkStatus(); } + + Status Remove(K key) override { return OkStatus(); } + + Status BatchLookup(const K* keys, size_t size, 
void** value_ptrs) override { + return OkStatus(); + } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return OkStatus(); + } + + Status BatchRemove(const K* keys, size_t size) override { return OkStatus(); } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + return OkStatus(); + } + + int64 Size() const override { return 0; } + + void FreeValuePtr(void* value_ptr) override {} + + Status Commit(K key, const void* value_ptr) override { return OkStatus(); } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; + return OkStatus(); + } + + std::string DebugString() const override { return std::string(); } + + GPUHashTable* HashTable() override { return hash_table_; } + + Status BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) override { + if (n > 0) { + if (is_inference_) { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), static_hash_table_, + default_v, config_.default_value_dim, device.stream()); + } else { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), hash_table_, + default_v, config_.default_value_dim, device.stream()); + } + } + return OkStatus(); + } + + private: + void Resize(int hint) { + while (hint > 0) { + for (int i = 0; i < (config_.block_num * (1 + config_.slot_num)); ++i) { + V* ptr = TypedAllocator::Allocate( + alloc_, value_len_ * hash_table_->initial_bank_size, + AllocationAttributes()); + hash_table_->bank_ptrs.push_back(ptr); + bool* ptr2 = TypedAllocator::Allocate( + alloc_, hash_table_->initial_bank_size, AllocationAttributes()); + hash_table_->existence_flag_ptrs.push_back(ptr2); + cudaMemset(ptr2, 0, sizeof(bool) * hash_table_->initial_bank_size); + } + hint -= hash_table_->initial_bank_size; + ++hash_table_->mem_bank_num; + } + + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); + if (hash_table_->d_bank_ptrs) { + TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, + num_elements); + TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, + num_elements); + } + hash_table_->d_bank_ptrs = TypedAllocator::Allocate( + alloc_, num_elements, AllocationAttributes()); + cudaMemcpy(hash_table_->d_bank_ptrs, hash_table_->bank_ptrs.data(), + num_elements * sizeof(V*), cudaMemcpyHostToDevice); + hash_table_->d_existence_flag_ptrs = TypedAllocator::Allocate( + alloc_, num_elements, AllocationAttributes()); + cudaMemcpy(hash_table_->d_existence_flag_ptrs, + hash_table_->existence_flag_ptrs.data(), + num_elements * sizeof(bool*), cudaMemcpyHostToDevice); + } + + void EventSynchronize(const cudaStream_t& stream) { + cudaEvent_t is_finish; + cudaEventCreate(&is_finish); + cudaEventRecord(is_finish, stream); + cudaEventSynchronize(is_finish); + cudaEventDestroy(is_finish); + } + + private: + EmbeddingConfig config_; + bool is_inference_; + GPUStaticHashTable* static_hash_table_; + GPUHashTable* hash_table_; + Allocator* alloc_; + int64 value_len_; + mutex lock_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // 
TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc new file mode 100644 index 00000000..fa114f62 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "gpu_hash_table.h" + +#include +#include +#include + +#include "cuco/dynamic_map.cuh" +#include "cuco/static_map.cuh" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace cg = cooperative_groups; + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +namespace { +const size_t BLOCK_SIZE = 128; +const size_t STRIDE = 1; +const size_t TILE_SIZE = 4; +} // namespace +template +class gpu_hash_map_tf_allocator { + public: + Allocator* alloc_; + using value_type = T; + + gpu_hash_map_tf_allocator(Allocator* alloc) : alloc_(alloc) {} + + gpu_hash_map_tf_allocator(const gpu_hash_map_tf_allocator& a) noexcept + : alloc_(a.alloc_) {} + + template + gpu_hash_map_tf_allocator(const gpu_hash_map_tf_allocator& a) noexcept + : alloc_(a.alloc_) {} + + gpu_hash_map_tf_allocator& operator=( + const gpu_hash_map_tf_allocator& a) noexcept { + return *this; + } + + gpu_hash_map_tf_allocator& operator=(gpu_hash_map_tf_allocator&& a) { + alloc_ = a.alloc_; + return *this; + } + + ~gpu_hash_map_tf_allocator() noexcept {} + + value_type* allocate(size_t size) const { + void* ptr = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + size * sizeof(value_type), AllocationAttributes()); + return (value_type*)ptr; + } + + void deallocate(value_type* ptr, size_t) const { alloc_->DeallocateRaw(ptr); } +}; + +template +bool operator==(gpu_hash_map_tf_allocator const&, + gpu_hash_map_tf_allocator const&) noexcept { + return true; +} + +template +bool operator!=(gpu_hash_map_tf_allocator const& lhs, + gpu_hash_map_tf_allocator const& rhs) noexcept { + return not(lhs == rhs); +} + +template > +class DynamicHashTable { + public: + cuco::dynamic_map + map_; + + DynamicHashTable(size_t initial_capacity, KeyType empty_key_sentinel, + ValueType empty_value_sentinel, CUCOAllocator alloc) + : map_(initial_capacity, empty_key_sentinel, empty_value_sentinel, + alloc) {} + ~DynamicHashTable() {} +}; + +template +GPUHashTable::GPUHashTable(K empty_key_sentinel, Allocator* alloc, + size_t initial_capacity) + : initial_bank_size(initial_capacity) { + hash_table = + new DynamicHashTable(initial_capacity, empty_key_sentinel, -1, + gpu_hash_map_tf_allocator(alloc)); + cudaMallocManaged( + &start_idx, sizeof(cuda::atomic)); + *start_idx = 0; +} 
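+// Note: start_idx above is allocated with cudaMallocManaged so the host can read
+// the insertion cursor that device-side lookup/insert kernels advance.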
+ +template +GPUHashTable::~GPUHashTable() { + delete hash_table; + cudaFree(start_idx); +} + +template +int32 GPUHashTable::Size() { + return hash_table->map_.get_size(); +} + +template > +class StaticHashTable { + public: + cuco::static_map map_; + + StaticHashTable(size_t initial_capacity, K empty_key_sentinel, + int32 empty_value_sentinel, CUCOAllocator alloc) + : map_(initial_capacity, empty_key_sentinel, empty_value_sentinel, + alloc) {} +}; + +template +GPUStaticHashTable::GPUStaticHashTable(size_t capacity, int dimension, + K empty_key_sentinel, + int32 empty_value_sentinel, + Allocator* alloc, + cudaStream_t stream) { + capacity_ = capacity; + dimension_ = dimension; + // cudaMallocAsync(&values_d, sizeof(V) * dimension * capacity, stream); + // cudaMallocManaged(&values_d, sizeof(V) * dimension * capacity); + + hash_table = new StaticHashTable( + capacity / 0.8 /*load_factor*/, empty_key_sentinel, empty_value_sentinel, + gpu_hash_map_tf_allocator(alloc)); +} + +template +GPUStaticHashTable::~GPUStaticHashTable() { + delete hash_table; + delete default_values; + cudaFree(values_d); +} + +template +std::size_t GPUStaticHashTable::Size() { + return hash_table->map_.get_size(); +} + +#define REGISTER_ALL_TYPE(type) \ + template class GPUHashTable; \ + template class GPUHashTable; \ + template class GPUStaticHashTable; \ + template class GPUStaticHashTable; +TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) +#undef REGISTER_ALL_TYPE + +namespace functor { +using atomicT = cuda::atomic; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_initialize_static_map(const Key* key_first, int32 num_items, + int32 dimension, + mutableViewT map_mutable_view, + atomicT* num_successes, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 value = key_idx * dimension; + + auto const insert_pair = cuco::pair_type{key, value}; + if (map_mutable_view.insert(tile, insert_pair, hash, key_equal) && + tile.thread_rank() == 0) { + thread_num_successes++; + } + + key_idx += (gridDim.x * blockDim.x) / tile_size; + } + std::size_t block_num_successes = + BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + *num_successes += block_num_successes; + } +} + +template +struct KvInitStaticMap { + void operator()(const Key* keys, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream) { + using MutableViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_mutable_view; + + auto& map = hash_table->hash_table->map_; + size_t num_to_insert = num_items; + while (num_to_insert > 0) { + static_assert(sizeof(std::size_t) == sizeof(atomicT)); + CUCO_CUDA_TRY( + cudaMemsetAsync(map.get_num_success(), 0, sizeof(atomicT), stream)); + + auto n = std::min((size_t)65535, num_to_insert); + auto const grid_size = + (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_initialize_static_map, + thrust::equal_to>, + grid_size, BLOCK_SIZE, 0, stream, keys, n, dimension, + map.get_device_mutable_view(), map.get_num_success(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + + 
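+      // Wait for the insert kernel to finish so the num_success counter it
+      // accumulated on the device is valid before it is read on the host and
+      // folded into the static map's size.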
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + + std::size_t h_num_successes = + map.get_num_success()->load(cuda::std::memory_order_relaxed); + map.update_size(h_num_successes); + keys += n; + num_to_insert -= n; + } + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_dynamic_key_kernel( + const Key* key_first, V** value_srcs, V* value_first, const V* default_v, + int32 default_v_num, size_t num_items, int32 dimension, ViewT* submap_views, + uint32_t num_submaps, int32 slot_idx, int32 slot_num, int32 bank_size, + Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view = submap_views[i]; + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { + found_value = found->second; + break; + } + } + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[abs(key) % default_v_num * dimension + id]; + } + } else { + auto bank_idx = found_value / bank_size; + auto offset_in_bank = found_value % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + value_srcs[slot_offset][offset_in_bank * dimension + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey, Key, V> { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + int32 slot_idx, int32 slot_num, + GPUHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { + using mutableViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::mutable_view_type; + using ViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + + auto& map = hash_table->hash_table->map_; + + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_dynamic_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->d_bank_ptrs, vals, + default_v, default_v_num, num_items, dimension, + map.get_submap_views().data().get(), map.get_submaps().size(), slot_idx, + slot_num, hash_table->initial_bank_size, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_static_key_kernel( + const Key* key_first, const V* value_srcs, V* value_first, + const V* default_v, int32 default_v_num, size_t num_items, int32 dimension, + ViewT map_views, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + auto grid = cooperative_groups::this_grid(); + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; // actual thread idx + auto empty_value_sentinel = map_views.get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 
found_value = empty_value_sentinel; + auto found = map_views.find(tile, key, hash, key_equal); + if (found != map_views.end()) { + found_value = found->second; + } + + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[abs(key) % default_v_num * dimension + id]; + } + } else { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = value_srcs[found_value + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey, Key, V> { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + int32 slot_idx, int32 slot_num, + GPUStaticHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { + using ViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_view; + auto& map = hash_table->hash_table->map_; + + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_static_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->values_d, vals, + default_v, default_v_num, num_items, dimension, map.get_device_view(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_and_insert_key_kernel( + const Key* key_first, int32* value_first, int32 num_items, + mutableViewT* submap_mutable_views, ViewT* submap_views, + uint32_t num_submaps, atomicT* num_successes, atomicT* start_idx, + int32 submap_idx, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); + int32 tmp; + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view = submap_views[i]; + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { + found_value = found->second; + break; + } + } + if (found_value == empty_value_sentinel) { + if (tile.thread_rank() == 0) { + tmp = start_idx->fetch_add(1); + } + found_value = tile.shfl(tmp, 0); + auto insert_pair = cuco::pair_type{key, found_value}; + if (submap_mutable_views[submap_idx].insert(tile, insert_pair, hash, + key_equal) && + tile.thread_rank() == 0) { + thread_num_successes++; + } + } + + if (tile.thread_rank() == 0) { + *(value_first + key_idx) = found_value; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } + + std::size_t block_num_successes = + BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + *num_successes += block_num_successes; + } +} + +template +struct KvLookupInsertKey { + void operator()(const Key* key_first, int32* value_first, int32 num_items, + GPUHashTable* hash_table, atomicT* start_idx, + cudaStream_t stream) { + using mutableViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::mutable_view_type; + using ViewT = typename cuco::dynamic_map< + Key, int32, 
cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + auto& map = hash_table->hash_table->map_; + map.reserve(map.get_size() + num_items); + uint32_t submap_idx = 0; + std::size_t num_to_insert = num_items; + + while (num_to_insert > 0) { + std::size_t capacity_remaining = + map.get_max_load_factor() * + map.get_submaps()[submap_idx]->get_capacity() - + map.get_submaps()[submap_idx]->get_size(); + if (capacity_remaining >= map.get_min_insert_size()) { + *(map.get_num_successes()) = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(map.get_num_successes(), + sizeof(atomicT), device_id)); + + auto n = std::min(capacity_remaining, num_to_insert); + + auto const grid_size = + (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_and_insert_key_kernel< + BLOCK_SIZE, TILE_SIZE, Key, mutableViewT, ViewT, + cuco::detail::MurmurHash3_32, thrust::equal_to>, + grid_size, BLOCK_SIZE, 0, stream, key_first, value_first, n, + map.get_submap_mutable_views().data().get(), + map.get_submap_views().data().get(), map.get_submaps().size(), + map.get_num_successes(), start_idx, submap_idx, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + std::size_t h_num_successes = + map.get_num_successes()->load(cuda::std::memory_order_relaxed); + map.update_submap_sizes(submap_idx, h_num_successes); + key_first += n; + value_first += n; + num_to_insert -= n; + } + submap_idx++; + } + } +}; + +template +__global__ void kv_lookup_or_create_emb_kernel( + const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 slot_idx, Value** d_banks, bool** d_flags, + int32 slot_num, int32 default_v_num, int32 bank_size) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + bool stored = d_flags[slot_offset][offset_in_bank]; + __syncthreads(); + if (stored == false) { + d_flags[slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + int32 default_v_idx = *(key_first + item_idx) % default_v_num; + d_banks[slot_offset][offset_in_bank * dim + id] = + default_v[default_v_idx * dim + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + val[item_idx * dim + id] = d_banks[slot_offset][offset_in_bank * dim + id]; + } +} + +template +struct KvLookupCreateEmb { + void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_or_create_emb_kernel, grid_size, block_size, 0, + stream, key_first, val, default_v, dim, item_idxs, slot_idx, d_banks, + d_flags, slot_num, default_v_num, bank_size)); + } +}; + +template +__global__ void kv_update_emb_kernel(const Key* key_first, Value* default_v, + int64 dim, int32* item_idxs, + int32 slot_idx, Value** d_banks, + bool** d_flags, int32 slot_num, + int32 default_v_num, int32 bank_size) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + bool 
stored = d_flags[slot_offset][offset_in_bank]; + __syncthreads(); + if (stored == false) { + d_flags[slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + int32 default_v_idx; + default_v_idx = item_idx % default_v_num; + d_banks[slot_offset][offset_in_bank * dim + id] = + default_v[default_v_idx * dim + id]; + } + } +} + +template +struct KvUpdateEmb { + void operator()(const Key* key_first, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + TF_CHECK_OK(GpuLaunchKernel(kv_update_emb_kernel, grid_size, + block_size, 0, stream, key_first, default_v, + dim, item_idxs, slot_idx, d_banks, d_flags, + slot_num, default_v_num, bank_size)); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_get_key_snapshot_kernel( + Key* key, int32* item_idxs, int32 slot_idx, int32 primary_slot_idx, + bool** d_flags, int32 bank_num, int32 slot_num, int32 bank_size, + ViewT* submap_views, uint32_t num_submaps, int32 ev_size, + Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + int n = 0; + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view_size = submap_views[i].get_capacity(); + for (auto j = 0; j < submap_view_size; ++j) { + auto found = submap_views[i].get_slot(j, hash, key_equal); + if (found != submap_views[i].end()) { + int32 item_pos = found->second; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + auto pri_slot_offset = bank_idx * slot_num + primary_slot_idx; + if (d_flags[slot_offset][offset_in_bank] && + d_flags[pri_slot_offset][offset_in_bank]) { + *(key + n) = found->first; + *(item_idxs + n) = found->second; + ++n; + } + } + } + } + for (auto i = n; i < ev_size; ++i) { + *(key + n) = submap_views[0].get_empty_key_sentinel(); + } +} + +template +struct KvKeyGetSnapshot { + void operator()(Key* key_first, int32* value_first, int32 slot_idx, + int32 primary_slot_idx, bool** d_flags, int32 bank_num, + int32 slot_num, int32 bank_size, + GPUHashTable* hash_table, int32 ev_size, + cudaStream_t stream) { + using ViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + auto& map = hash_table->hash_table->map_; + + auto const block_size = 1; + auto const grid_size = 1; + TF_CHECK_OK(GpuLaunchKernel( + kv_get_key_snapshot_kernel, + thrust::equal_to>, + grid_size, block_size, 0, stream, key_first, value_first, slot_idx, + primary_slot_idx, d_flags, bank_num, slot_num, bank_size, + map.get_submap_views().data().get(), map.get_submaps().size(), ev_size, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + } +}; + +template +__global__ void kv_emb_get_snapshot_kernel(Key* key, Value* val, + Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 slot_idx, + Value** d_banks, int32 bank_num, + int32 slot_num, int32 bank_size, + int32 total_num) { + auto item_idx = blockIdx.x; + if (item_idx < total_num) { + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + if (key[item_idx] != empty_key_sentinel) { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + val[item_idx * dim + id] = + 
d_banks[slot_offset][offset_in_bank * dim + id]; + } + } + } +} + +template +struct KvEmbGetSnapshot { + void operator()(Key* key, Value* val, Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + Value** d_banks, int32 bank_num, int32 slot_num, + int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + if (grid_size == 0) return; + TF_CHECK_OK(GpuLaunchKernel( + kv_emb_get_snapshot_kernel, grid_size, block_size, 0, + stream, key, val, empty_key_sentinel, dim, item_idxs, slot_idx, d_banks, + bank_num, slot_num, bank_size, num_items)); + } +}; + +} // namespace functor + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvUpdateEmb; \ + template struct functor::KvUpdateEmb; +TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) + +#define REGISTER_LOOKUP_KERNEL_ALL(hash_table, type) \ + template struct functor::KvLookupKey, int32, type>; \ + template struct functor::KvLookupKey, int64, type>; +#define REGISTER_INFERENCE_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUHashTable, type) +#define REGISTER_TRAINING_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUStaticHashTable, type) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_INFERENCE_LOOKUP_KERNEL) +TF_CALL_REAL_NUMBER_TYPES(REGISTER_TRAINING_LOOKUP_KERNEL) + +#undef REGISTER_INFERENCE_LOOKUP_KERNEL +#undef REGISTER_TRAINING_LOOKUP_KERNEL +#undef REGISTER_LOOKUP_KERNEL_ALL_TYPE +#undef REGISTER_ALL_TYPE + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h new file mode 100644 index 00000000..497b8017 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h @@ -0,0 +1,136 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ + +#if GOOGLE_CUDA +#include + +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/status.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +template +class gpu_hash_map_tf_allocator; + +template +class DynamicHashTable; + +template +class StaticHashTable; + +template +class GPUStaticHashTable { + public: + GPUStaticHashTable(size_t capacity, int dimension, K empty_key_sentinel, + int32 empty_value_sentinel, Allocator* alloc, + cudaStream_t stream); + + ~GPUStaticHashTable(); + + std::size_t Size(); + + StaticHashTable>* hash_table; + V* values_d{nullptr}; + int dimension_; + V* default_values{nullptr}; + int capacity_; +}; + +template +class GPUHashTable { + public: + GPUHashTable(K empty_key_sentinel, Allocator* alloc, + size_t initial_capacity = 50000); + + ~GPUHashTable(); + + int32 Size(); + + DynamicHashTable>* hash_table; + + const int32 initial_bank_size; + cuda::atomic* start_idx; + int32 mem_bank_num = 0; + std::vector bank_ptrs; + V** d_bank_ptrs = nullptr; + std::vector existence_flag_ptrs; + bool** d_existence_flag_ptrs = nullptr; +}; + +namespace functor { + +template +struct KvLookupKey { + void operator()(const Key* key_first, V* value_first, int32 num_items, + int32 dimension, int32 slot_idx, int32 slot_num, + HashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream); +}; + +template +struct KvInitStaticMap { + void operator()(const Key* key_first, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream); +}; + +template +struct KvLookupInsertKey { + void operator()( + const Key* key_first, int32* value_first, int32 num_items, + GPUHashTable* hash_table, + cuda::atomic* start_idx, + cudaStream_t stream); +}; + +template +struct KvLookupCreateEmb { + void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream); +}; + +template +struct KvUpdateEmb { + void operator()(const Key* key_first, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream); +}; + +template +struct KvKeyGetSnapshot { + void operator()(Key* key_first, int32* value_first, int32 slot_idx, + int32 primary_slot_idx, bool** d_flags, int32 bank_num, + int32 slot_num, int32 bank_size, + GPUHashTable* hash_table, int32 ev_size, + cudaStream_t stream); +}; + +template +struct KvEmbGetSnapshot { + void operator()(Key* key, Value* val, Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + Value** d_banks, int32 bank_num, int32 slot_num, + int32 bank_size, cudaStream_t stream); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h new file mode 100644 index 00000000..430acb5a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h @@ 
-0,0 +1,601 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "hbm_storage_iterator.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +template +class CheckpointLoader; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +class HbmDramSsdStorage : public MultiTierStorage { + public: + HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name), + dram_capacity_(-1) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); + } + + ~HbmDramSsdStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete hbm_; + delete dram_; + delete ssd_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); + + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] / (total_dim() * sizeof(V)); + + dram_capacity_ = + Storage::storage_config_.size[1] / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; + } + + Status Get(K key, void** value_ptr) override { + Status s = hbm_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; + } + s = ssd_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK_AND_DESTROY); + return s; + } + return s; + } + + void BatchGet(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + std::vector> ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0], ssd_value_ptr_list[0]); + } + + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + std::vector> ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list, + ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0], ssd_value_ptr_list[0]); + + CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], + value_len); + } + + void Insert(K key, void** value_ptr) override { + hbm_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + if (to_dram) { + dram_->Insert(key, value_ptr); + } else { + hbm_->Insert(key, value_ptr); + } + } + + Status GetOrCreate(K key, 
void** value_ptr) override { + LOG(FATAL) << "Stroage with HBM only suppotrs batch APIs."; + } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + MultiTierStorage::InitCache(cache_strategy); + dram_cache_ = new LRUCache(); + } + + Status Remove(K key) override { + hbm_->Remove(key); + dram_->Remove(key); + ssd_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = hbm_->Size(); + total_size += dram_->Size(); + total_size += ssd_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return hbm_->Size(); + } else if (level == 1) { + return dram_->Size(); + } else if (level == 2) { + return ssd_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = hbm_->Contains(key); + if (s.ok()) return 0; + s = dram_->Contains(key); + if (s.ok()) return 1; + s = ssd_->Contains(key); + if (s.ok()) return 2; + return -1; + } + + bool IsUseHbm() override { return true; } + + bool IsSingleHbm() override { return false; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter(key_list, value_ptr_list, + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + + for (int64 i = 0; i < value_ptr_list.size(); i++) { + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq(value_ptr, + hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); + } + + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, &tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } + } + + { + mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, &hbm_value_iter))); + } + + for (auto value_ptr : value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } + } + + ssd_->Save(tensor_name, prefix, writer, emb_config, shrink_args, value_len, + default_value); + + return OkStatus(); + } + + Status DramToSsdBatchCommit(std::shared_ptr> keys) { + MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, + dram_feat_desc_); + mutex_lock l(*(ssd_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + dram_cache_->update(keys->data(), keys->size()); + int64 dram_count = dram_cache_->size(); + if (dram_count > dram_capacity_) { + int k_size = dram_count - dram_capacity_; + constexpr int DramEvictionSize = 10000; + 
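+      // Cap each commit at DramEvictionSize ids; keys beyond the cap stay in
+      // DRAM and are handled by a later commit.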
k_size = std::min(k_size, DramEvictionSize); + K dram_evic_ids[DramEvictionSize]; + size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); + void* value_ptr; + for (int64 i = 0; i < true_size; ++i) { + if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(dram_evic_ids[i])); + dram_value_ptr_out_of_date_.emplace_back(value_ptr); + } + } + } + return OkStatus(); + } + + void BatchEviction() override { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!MultiTierStorage::ready_eviction_) { + return; + } + mutex_lock l(*(hbm_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + int64 cache_count = MultiTierStorage::cache_->size(); + if (cache_count > MultiTierStorage::cache_capacity_) { + // eviction + int k_size = cache_count - MultiTierStorage::cache_capacity_; + k_size = std::min(k_size, EvictionSize); + size_t true_size = + MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); + void* value_ptr; + std::shared_ptr> keys(new std::vector()); + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; + + for (int64 i = 0; i < true_size; ++i) { + if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { + keys->emplace_back(evic_ids[i]); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); + } + } + + CopyEmbeddingFromHbmToDram(hbm_value_ptrs, dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); + for (auto it : *keys) { + TF_CHECK_OK(hbm_->Remove(it)); + } + MultiTierStorage::eviction_manager_->Schedule( + [this, keys]() { DramToSsdBatchCommit(keys); }); + } + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + int total_dim() override { return hbm_feat_desc_->total_dim(); } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) override { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, 
bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, true /*to_dram*/, is_incr, + restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override {} + + private: + void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * value_len * sizeof(V)); + V* dev_value_address = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + } + // Split from above for loop for minize the cost of mutex lock + // TODO: Speed up with intra parallelism + + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = + hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void*)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, block_dim, + args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector>& ssd_value_ptr_list, + std::vector>* not_found_cursor_list = nullptr) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, int64, int)> + set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) {}; + } + + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list, &ssd_value_ptr_list, + set_not_found_list, + ¬_found_cursor_list](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], 
&value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } else if (copyback_flag == COPYBACK_AND_DESTROY) { + copyback_cursor_list[copy_id].emplace_back(i); + ssd_value_ptr_list[copy_id].emplace_back(value_ptr_list[i]); + } + } else { + value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size() > 0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + if (ssd_value_ptr_list[i].size() > 0) { + ssd_value_ptr_list[0].splice(ssd_value_ptr_list[0].end(), + ssd_value_ptr_list[i]); + } + } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size() > 0) { + (*not_found_cursor_list)[0].splice((*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptr_list, + std::list& copyback_cursors, + std::list& ssd_value_ptrs) { + int64 total = copyback_cursors.size(); + std::vector gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = copyback_cursors.cbegin(); + // Mutex with eviction thread + for (; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + gpu_value_ptr, dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, memory_index, + gpu_value_ptrs, hbm_feat_desc_->total_dim(), hbm_feat_desc_, + dram_feat_desc_); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs, memory_index, + value_ptr_list](int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert(copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + + for (auto it = ssd_value_ptrs.cbegin(); it != ssd_value_ptrs.cend(); ++it) { + ssd_->DestroyValuePtr(*it); + } + } + + void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, + std::list& not_found_cursors, int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector> insert_pairs(total); + std::vector cursor_index(total); + // Create Hbm ValuePtrs. 
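+      // One HBM value pointer is created per missing key first; the default
+      // values for the whole batch are then written in a single
+      // SetDefaultValues call on the compute stream.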
+ + int64 i = 0; + auto it = not_found_cursors.cbegin(); + // Mutex with eviction thread + for (; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + + hbm_feat_desc_->SetDefaultValues(keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, + ctx.gpu_device); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = + hbm_->TryInsert(insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + hbm_->DestroyValuePtr(insert_pairs[i].second); + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + } + + void AddCopyBackFlagToValuePtr(void** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast(tmp); + } + + private: + HbmStorageWithCpuKv* hbm_ = nullptr; + DramStorage* dram_ = nullptr; + SsdHashStorage* ssd_ = nullptr; + Allocator* gpu_alloc_; + BatchCache* dram_cache_; + int64 dram_capacity_; + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; + const int copyback_flag_offset_bits_ = 60; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h new file mode 100644 index 00000000..5b9531c9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h @@ -0,0 +1,536 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "hbm_storage_iterator.h" +#include "intra_thread_copy_id_allocator.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +template +class CheckpointLoader; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +class HbmDramStorage : public MultiTierStorage { + public: + HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + } + + ~HbmDramStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete hbm_; + delete dram_; + delete dram_feat_desc_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); + + Status Get(K key, void** value_ptr) override { + Status s = hbm_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; + } + return s; + } + + void BatchGet(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0]); + } + + void Insert(K key, void** value_ptr) override { + hbm_->Insert(key, value_ptr); + } + + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0]); + CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], + value_len); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + if (to_dram) { + dram_->CreateAndInsert(key, value_ptr); + } else { + hbm_->CreateAndInsert(key, value_ptr); + } + } + + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL) << "Stroage with HBM only suppotrs batch APIs."; + } + + Status Remove(K key) override { + hbm_->Remove(key); + dram_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = hbm_->Size(); + total_size += dram_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return hbm_->Size(); + } else if (level == 1) { + return dram_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = hbm_->Contains(key); + if (s.ok()) return 0; + s = dram_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + bool IsUseHbm() 
override { return true; } + + bool IsSingleHbm() override { return false; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter(key_list, value_ptr_list, + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + + for (int64 i = 0; i < value_ptr_list.size(); i++) { + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq(value_ptr, + hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); + } + + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, &tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } + } + + { + mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, &hbm_value_iter))); + } + + for (auto value_ptr : value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } + } + return OkStatus(); + } + + void BatchEviction() override { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!MultiTierStorage::ready_eviction_) { + return; + } + mutex_lock l(*(hbm_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + int64 cache_count = MultiTierStorage::cache_->size(); + if (cache_count > MultiTierStorage::cache_capacity_) { + // eviction + int k_size = cache_count - MultiTierStorage::cache_capacity_; + k_size = std::min(k_size, EvictionSize); + size_t true_size = + MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); + void* value_ptr; + std::vector keys; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; + + for (int64 i = 0; i < true_size; ++i) { + if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { + keys.emplace_back(evic_ids[i]); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); + } + } + + CopyEmbeddingFromHbmToDram(hbm_value_ptrs, dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); + for (auto it : keys) { + TF_CHECK_OK(hbm_->Remove(it)); + } + } + } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool 
is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) override { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + + protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, true /*to_dram*/, is_incr, + restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } + + int total_dim() override { return hbm_feat_desc_->total_dim(); } + + private: + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector>* not_found_cursor_list = nullptr) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, int64, int)> + set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) {}; + } + + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list, set_not_found_list, + ¬_found_cursor_list](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], &value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == CopyBackFlag::COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } + } else { + value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); + } + } + }; + auto worker_threads = ctx.worker_threads; + 
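+    // Shard the per-key lookups over the intra-op thread pool; each worker
+    // appends to its own copy-id-indexed list, so recording copyback and
+    // missing keys needs no locking, and the lists are spliced together below.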
Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size() > 0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size() > 0) { + (*not_found_cursor_list)[0].splice((*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptr_list, + std::list& copyback_cursors) { + int64 total = copyback_cursors.size(); + std::vector gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = copyback_cursors.cbegin(); + // Mutex with eviction thread + for (; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + gpu_value_ptr, dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, memory_index, + gpu_value_ptrs, hbm_feat_desc_->total_dim(), hbm_feat_desc_, + dram_feat_desc_); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs, memory_index, + value_ptr_list](int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert(copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + + void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, + std::list& not_found_cursors, int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector> insert_pairs(total); + std::vector cursor_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for (; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + + hbm_feat_desc_->SetDefaultValues(keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, + ctx.gpu_device); + + // Insert copyback ids to hbm hash table. 
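+      // TryInsert fails when another thread has already inserted the key; in
+      // that case the freshly created value pointer is destroyed and the
+      // pointer already stored for that key is fetched instead.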
+ auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = + hbm_->TryInsert(insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + hbm_->DestroyValuePtr(insert_pairs[i].second); + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + } + + void AddCopyBackFlagToValuePtr(void** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * value_len * sizeof(V)); + V* dev_value_address = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + } + // Split from above for loop for minize the cost of mutex lock + // TODO: Speed up with intra parallelism + + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = + hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void*)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, block_dim, + args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + + private: + HbmStorageWithCpuKv* hbm_ = nullptr; + DramStorage* dram_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; + Allocator* gpu_alloc_; + const int copyback_flag_offset_bits_ = 60; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000..ea9639ba --- 
/dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,116 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "embedding_memory_pool.h" +#include "feature_descriptor_impl.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/env_var.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +// #include "xla/stream_executor/stream.h" +// #include "xla/stream_executor/stream_executor.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor( + &dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset(new EmbeddingMemoryPool( + hbm_alloc_, hbm_alloc_bytes_ / sizeof(V), 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void* val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr : value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL) << "Can't call SetDefaultValue(void* val, int64 key," + << "int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { hbm_alloc_ = alloc; } + + template + void SetDefaultValues(const K* keys, const 
std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { return dram_alloc_bytes_; } + + public: + friend class NormalFeatureDescriptorImpl; + + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; // ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h new file mode 100644 index 00000000..848c55bb --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h @@ -0,0 +1,124 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ + +#if GOOGLE_CUDA +#include "storage.h" +namespace tensorflow { + +template +class ValuePtr; + +namespace embedding { +template +class HbmValueIterator : public ValueIterator { + public: + HbmValueIterator(const std::vector& key_list, + const std::vector& value_ptr_list, int64 emb_index, + int64 value_len, Allocator* alloc, + FeatureDescriptor* feat_desc) + : value_len_(value_len), alloc_(alloc) { + int64 emb_offset = value_len_ * emb_index; + std::vector> value_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + value_parts_vec[part_id].emplace_back( + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); + break; + } + } + } + + for (int64 i = 0; i < kSavedPartitionNum; i++) { + values_.splice(values_.end(), value_parts_vec[i]); + } + + values_iter_ = values_.begin(); + + num_of_embs_ = buffer_capacity_ / value_len_; + dev_addr_list_ = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + num_of_embs_ * sizeof(V*)); + dev_embedding_buffer_ = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, buffer_capacity_ * sizeof(V)); + + FillEmbeddingBuffer(); + } + + ~HbmValueIterator() { + alloc_->DeallocateRaw(dev_addr_list_); + alloc_->DeallocateRaw(dev_embedding_buffer_); + } + + V* Next() { + if (buffer_cursor_ == num_of_embs_) { + FillEmbeddingBuffer(); + buffer_cursor_ = 0; + } + + V* val = embedding_buffer_ + value_len_ * buffer_cursor_; + counter_++; + values_iter_++; + buffer_cursor_++; + return val; + } + + private: + void FillEmbeddingBuffer() { + int64 total_num = + std::min(num_of_embs_, (int64)(values_.size() - counter_)); + std::vector local_addr_list(total_num); + auto iter = values_iter_; + for 
(int64 i = 0; i < total_num; i++) { + local_addr_list[i] = *iter; + iter++; + } + cudaMemcpy(dev_addr_list_, local_addr_list.data(), sizeof(V*) * total_num, + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_addr_list_, + (void*)&dev_embedding_buffer_, + (void*)&value_len_, + (void*)&total_num, + nullptr, + nullptr}; + cudaLaunchKernel((void*)BatchCopy, + (total_num + block_dim - 1) / block_dim * value_len_, + block_dim, args, 0, NULL); + cudaDeviceSynchronize(); + cudaMemcpy(embedding_buffer_, dev_embedding_buffer_, + sizeof(V) * total_num * value_len_, cudaMemcpyDeviceToHost); + } + + private: + std::list values_; + typename std::list::iterator values_iter_; + const static int64 buffer_capacity_ = 1024 * 1024 * 1; + V embedding_buffer_[buffer_capacity_]; + int64 counter_ = 0; + int64 buffer_cursor_ = 0; + int64 value_len_; + int64 num_of_embs_ = 0; + Allocator* alloc_; + V** dev_addr_list_; + V* dev_embedding_buffer_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h b/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h new file mode 100644 index 00000000..5f97a2e2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h @@ -0,0 +1,73 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ + +#include +#include +#include + +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { + +// Allocate a copy id for each thread +class IntraThreadCopyIdAllocator { + public: + IntraThreadCopyIdAllocator(int num_threads) + : num_worker_threads_(num_threads) { + is_occupy_flag_.reset(new bool[num_worker_threads_]); + memset(is_occupy_flag_.get(), 0, sizeof(bool) * num_worker_threads_); + } + + int64 GetCopyIdOfThread(uint64 main_thread_id) { + uint64 thread_id = Env::Default()->GetCurrentThreadId(); + if (thread_id == main_thread_id) { + return num_worker_threads_; + } else { + int copy_id = -1; + { + spin_rd_lock l(mu_); + auto iter = hash_map_.find(thread_id); + if (iter != hash_map_.end()) { + copy_id = iter->second; + return copy_id; + } + } + if (copy_id == -1) { + // bind a new thread to a local cursor_list + copy_id = thread_id % num_worker_threads_; + while (!__sync_bool_compare_and_swap(&(is_occupy_flag_[copy_id]), false, + true)) { + copy_id = (copy_id + 1) % num_worker_threads_; + } + { + spin_wr_lock l(mu_); + hash_map_.insert(std::pair(thread_id, copy_id)); + } + return copy_id; + } + } + } + + private: + int num_worker_threads_; + std::unique_ptr is_occupy_flag_; + std::map hash_map_; + mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h b/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h new file mode 100644 index 00000000..b80b58bd --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h @@ -0,0 +1,121 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ + +#include "feature_descriptor.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace { +const char* kInferenceMode = "INFERENCE_MODE"; +const int kSavedPartitionNum = 1000; +} // namespace + +template +class GPUHashTable; + +using GPUDevice = Eigen::GpuDevice; +namespace embedding { + +template +class ValueIterator { + public: + virtual V* Next() = 0; +}; + +template +class KVInterface { + public: + virtual ~KVInterface() {} + virtual Status Lookup(K key, void** value_ptr) = 0; + virtual Status Contains(K key) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; + virtual Status Remove(K key) = 0; + + virtual Status BatchLookup(const K* keys, size_t size, void** value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchLookup in KVInterface."); + } + // KV Batch Insert + virtual Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchInsert in KVInterface."); + } + // KV Batch Remove + virtual Status BatchRemove(const K* keys, size_t size) { + return errors::Unimplemented( + "Unimplemented for BatchRemove in KVInterface."); + } + + virtual Status BatchLookupOrCreate(const K* keys, size_t size, + void** value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchLookupOrInsert in KVInterface."); + } + + virtual void UpdateValuePtr(K key, void* new_value_ptr, void* old_value_ptr) { + LOG(FATAL) << "Unimplemented for UpdateValuePtr in KVInterface."; + } + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) = 0; + + // KV Size + virtual int64 Size() const = 0; + + virtual void FreeValuePtr(void* value_ptr) {} + + virtual Status Commit(K key, const void* value_ptr) { return OkStatus(); } + + virtual Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) = 0; + + virtual Status GetShardedSnapshot( + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, + int partition_nums) = 0; + + virtual std::string DebugString() const = 0; + + virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, + int32 default_v_num, size_t n, + const GPUDevice& device) { + return OkStatus(); + } + virtual Status BatchLookupOrCreateKeys(const K* keys, size_t n, + int32* item_idxs, + const GPUDevice& device) { + return OkStatus(); + } + + virtual Status BatchLookup(const GPUDevice& device, const K* keys, V* val, + size_t n, const V* default_v) { + return errors::Unimplemented( + "Unimplemented for BatchLookup in KVInterface."); + } + + virtual GPUHashTable* HashTable() { return nullptr; } + + virtual void SetValueLen(int64 value_len) {} +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h new file mode 100644 index 00000000..18f3d2b8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ + +#include "shrink_policy.h" + +namespace tensorflow { + +namespace embedding { +template +class L2WeightShrinkPolicy : public ShrinkPolicy { + public: + L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, + FeatureDescriptor* feat_desc, KVInterface* kv) + : index_(index), + kv_(kv), + l2_weight_threshold_(l2_weight_threshold), + ShrinkPolicy(feat_desc) {} + + TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override { + ShrinkPolicy::ReleaseValuePtrs(); + FilterToDelete(shrink_args.value_len, key_list, value_list); + } + + private: + void FilterToDelete(int64 value_len, std::vector& key_list, + std::vector& value_list) { + for (int64 i = 0; i < key_list.size(); ++i) { + V* val = + ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); + if (val != nullptr) { + V l2_weight = (V)0.0; + for (int64 j = 0; j < value_len; j++) { + l2_weight += val[j] * val[j]; + } + l2_weight *= (V)0.5; + if (l2_weight < (V)l2_weight_threshold_) { + kv_->Remove(key_list[i]); + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; + ShrinkPolicy::EmplacePointer(value_list[i]); + } + } + } + } + + private: + int64 index_; + // int64 offset_; + KVInterface* kv_; + float l2_weight_threshold_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h new file mode 100644 index 00000000..8d415d75 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h @@ -0,0 +1,288 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ + +#include + +#include "kv_interface.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/write_batch.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +using leveldb::DB; +using leveldb::Options; +using leveldb::ReadOptions; +using leveldb::WriteBatch; +using leveldb::WriteOptions; + +namespace tensorflow { +namespace embedding { + +template +class SizeCounter { + public: + SizeCounter(int num_parts) { + num_parts_ = num_parts; + for (int i = 0; i < num_parts_; i++) { + counter_.emplace_back(0); + } + } + + void add(K key, int64 count) { + int part = key % num_parts_; + __sync_fetch_and_add(&counter_[part], count); + } + + void sub(K key, int64 count) { + int part = key % num_parts_; + __sync_fetch_and_sub(&counter_[part], count); + } + + int64 size() { + int64 total = 0; + for (int i = 0; i < num_parts_; i++) { + total += counter_[i]; + } + return total; + } + + private: + std::vector counter_; + int num_parts_; +}; + +template +class LevelDBKV : public KVInterface { + public: + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { + path_ = io::JoinPath( + path, "level_db_" + std::to_string(Env::Default()->NowMicros())); + ; + options_.create_if_missing = true; + leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); + CHECK(s.ok()); + counter_ = new SizeCounter(8); + } + + ~LevelDBKV() override { delete db_; } + + Status Lookup(K key, void** value_ptr) override { + std::string val_str; + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::ReadOptions options; + leveldb::Status s = db_->Get(options, db_key, &val_str); + if (s.IsNotFound()) { + return errors::NotFound("Unable to find Key: ", key, " in LevelDB."); + } else { + void* val = feat_desc_->Allocate(); + memcpy((int64*)val, &val_str[0], val_str.length()); + *value_ptr = val; + return OkStatus(); + } + } + + Status Contains(K key) override { + std::string val_str; + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::ReadOptions options; + leveldb::Status s = db_->Get(options, db_key, &val_str); + if (s.IsNotFound()) { + return errors::NotFound("Unable to find Key: ", key, " in LevelDB."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + counter_->add(key, 1); + return OkStatus(); + } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return BatchCommit(keys, value_ptrs); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + WriteBatch batch; + for (int i = 0; i < keys.size(); i++) { + std::string value_res((char*)value_ptrs[i], feat_desc_->data_bytes()); + leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); + batch.Put(db_key, value_res); + delete value_ptrs[i]; + } + db_->Write(WriteOptions(), &batch); + return OkStatus(); + } + + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, feat_desc_->data_bytes()); + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); + if (!s.ok()) { + return errors::AlreadyExists("already exists Key: ", key, " in RocksDB."); + } else { + return OkStatus(); + } + } + + Status Remove(K key) override { + counter_->sub(key, 1); 
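+    // The sharded SizeCounter is kept in sync on every Insert/Remove since
+    // LevelDB does not expose a cheap exact key count; Size() simply sums the
+    // per-shard counters.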
+ leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::Status s = db_->Delete(WriteOptions(), db_key); + if (s.ok()) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in RocksDB."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + key_list->emplace_back(key); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq(value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion(value_ptr, + feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + int part_id = key % kSavedPartitionNum % partition_nums; + if (part_id == partition_id) continue; + key_list[part_id].emplace_back(key); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq(value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion(value_ptr, + feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list[part_id].emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return OkStatus(); + } + + int64 Size() const override { return counter_->size(); } + + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); + } + + std::string DebugString() const override { return ""; } + + private: + DB* db_; + SizeCounter* counter_; + Options options_; + std::string path_; + FeatureDescriptor* feat_desc_; +}; + +template +class DBValueIterator : public ValueIterator { + public: + DBValueIterator(const std::vector& key_list, int64 emb_index, + int64 value_len, LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) + : value_len_(value_len), + emb_index_(emb_index), + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { + int64 emb_offset = value_len_ * emb_index; + std::vector> keys_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + keys_parts_vec[part_id].emplace_back(key_list[i]); + break; + } + } + } + + for (int64 i = 0; i < kSavedPartitionNum; i++) { + keys_.splice(keys_.end(), keys_parts_vec[i]); + } + + keys_iter_ = 
keys_.begin(); + } + + ~DBValueIterator() { delete value_ptr_; } + + V* Next() { + if (value_ptr_ != nullptr) { + feat_desc_->Deallocate(value_ptr_); + } + K key = *(keys_iter_++); + + Status s = leveldb_kv_->Lookup(key, &value_ptr_); + if (!s.ok()) { + LOG(FATAL) << "Not found value in LevelDB when Save."; + } + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); + } + + private: + int64 value_len_; + int64 emb_index_; + LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; + std::list keys_; + typename std::list::const_iterator keys_iter_; + void* value_ptr_ = nullptr; + int64 key_cursor_ = 0; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc new file mode 100644 index 00000000..b2cd8026 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc @@ -0,0 +1,188 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "multi_tier_storage.h" + +#include "hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +void MultiTierStorage::CopyEmbeddingsFromDramToHbm( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, std::list& copyback_cursor, + const std::vector& memory_index, + const std::vector& gpu_value_ptrs, int value_len, + FeatureDescriptor* hbm_feat_desc, FeatureDescriptor* dram_feat_desc) { + if (copyback_cursor.size() > 0) { + int total = copyback_cursor.size(); + // Alocate memcpy buffer on CPU and GPU. 
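+    // The DRAM rows are first gathered into a contiguous host staging buffer,
+    // copied to the device with a single ThenMemcpy, and then scattered into
+    // the per-key HBM value pointers by the BatchUnpack kernel launched below.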
+ Allocator* gpu_alloc = ctx.gpu_allocator; + V* memcpy_buffer_gpu = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, total * value_len * sizeof(V)); + V* memcpy_buffer_cpu = (V*)cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, total * value_len * sizeof(V)); + + // Copy embeddings on CPU to bufer on CPU + auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, + gpu_value_ptrs, dram_feat_desc, value_len, + this](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + int j = memory_index[i]; + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); + value_ptr_list[j] = gpu_value_ptrs[i]; + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 1000, + do_work); + + // Copy embeddings from CPU buffer to GPU buffer + auto compute_stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + DeviceMemoryBase gpu_buffer_dst_ptr(memcpy_buffer_gpu, + total * value_len * sizeof(V)); + compute_stream->ThenMemcpy(&gpu_buffer_dst_ptr, memcpy_buffer_cpu, + total * value_len * sizeof(V)); + SyncWithEventMgr(compute_stream, event_mgr); + + // Copy addr of embeddings on GPU to GPU + V** value_address = (V**)cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * total); + V** dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * total); + int64 i = 0; + auto it = copyback_cursor.cbegin(); + for (; it != copyback_cursor.cend(); ++it, ++i) { + // Get the cursor + int64 cursor = *it; + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); + } + DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); + compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, + total * sizeof(V*)); + + // Copy each embedding to corresponding address + int block_dim = 128; + TF_CHECK_OK(GpuLaunchKernel( + BatchUnpack, (total + block_dim - 1) / block_dim * value_len, + block_dim, 0, ctx.gpu_device.stream(), dev_value_address, + memcpy_buffer_gpu, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(memcpy_buffer_gpu); + cpu_allocator()->DeallocateRaw(value_address); + cpu_allocator()->DeallocateRaw(memcpy_buffer_cpu); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ + const EmbeddingVarContext&, const ktype*, void**, \ + std::list&, const std::vector&, const std::vector&, \ + int, FeatureDescriptor*, FeatureDescriptor*); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, void** value_ptrs, + se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate( + cpu_allocator(), total * 2, AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = TypedAllocator::Allocate( + 
hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; + emb_index < FeatureDescriptorImpl::slot_infos_.size(); + emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, + keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, + total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = FeatureDescriptorImpl::slot_infos_[emb_index] + .default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, block_dim, 0, + gpu_device.stream(), dev_default_value_address, dev_value_address, + value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**, se::Stream*, EventMgr*, \ + const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h new file mode 100644 index 00000000..03e713b6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h @@ -0,0 +1,303 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ + +#include "cache_factory.h" +#include "cache_thread_pool_creator.h" +#include "cpu_hash_map_kv.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "embedding_var_context.h" +#include "embedding_var_restore.h" +#include "eviction_manager.h" +#include "globalstep_shrink_policy.h" +#include "kv_interface.h" +#include "l2weight_shrink_policy.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" + +#if GOOGLE_CUDA +#include "batch.h" +#endif + +namespace tensorflow { +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +namespace embedding { +template +class MultiTierStorage : public Storage { + public: + MultiTierStorage(const StorageConfig& sc, const std::string& name) + : Storage(sc), name_(name) {} + + virtual ~MultiTierStorage() { delete cache_; } + + TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); + + virtual void Init() override { + cache_capacity_ = + Storage::storage_config_.size[0] / (total_dim() * sizeof(V)); + ready_eviction_ = true; + } + + int64 CacheSize() const override { return cache_capacity_; } + + BatchCache* Cache() override { return cache_; } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + if (cache_ == nullptr) { + cache_ = CacheFactory::Create(cache_strategy, name_); + eviction_manager_ = EvictionManagerCreator::Create(); + eviction_manager_->AddStorage(this); + cache_thread_pool_ = CacheThreadPoolCreator::Create(); + } + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + LOG(FATAL) << "BatchCommit isn't supported by MultiTierStorage."; + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + LOG(FATAL) << "Can't get snapshot of MultiTierStorage."; + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + LOG(FATAL) << "Can't get sharded snapshot of MultiTierStorage."; + return OkStatus(); + } + + void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) override { + LOG(FATAL) << "Unsupport CopyEmbeddingsFromCPUToGPU in MultiTierStorage."; + }; + + Status Contains(K key) override { + LOG(FATAL) << "Contains is not support in MultiTierStorage."; + return OkStatus(); + } + + bool IsMultiLevel() override { return true; } + + void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) override { + return; + } + + void Schedule(std::function fn) override { + cache_thread_pool_->Schedule(std::move(fn)); + } + + virtual Status Eviction(K* evict_ids, int64 evict_size) override { + LOG(FATAL) << "Eviction isn't support by " << typeid(this).name(); + return OkStatus(); + } + + virtual void BatchEviction() { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!ready_eviction_) return; + int cache_count = cache_->size(); + if (cache_count > cache_capacity_) { + // eviction + int k_size = cache_count - cache_capacity_; + k_size = 
std::min(k_size, EvictionSize); + size_t true_size = cache_->get_evic_ids(evic_ids, k_size); + EvictionWithDelayedDestroy(evic_ids, true_size); + } + } + + void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) override { + Schedule([this, indices, indices_counts]() { + cache_->update(indices, indices_counts); + }); + } + + void UpdateCache(const Tensor& indices) override { + Schedule([this, indices]() { cache_->update(indices); }); + } + + virtual bool IsUseHbm() override { return false; } + + void AddToCachePrefetchList(const Tensor& indices) override { + Schedule([this, indices]() { cache_->add_to_prefetch_list(indices); }); + } + + void AddToCache(const Tensor& indices) override { + Schedule([this, indices]() { cache_->add_to_cache(indices); }); + } + + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, false /*to_dram*/, is_incr, + restore_buff); + + if (emb_config.is_primary()) { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (cache_) { + cache_->update(key_buff, key_num, version_buff, freq_buff); + auto cache_size = CacheSize(); + if (cache_->size() > cache_size) { + int64 evict_size = cache_->size() - cache_size; + std::vector evict_ids(evict_size); + size_t true_size = cache_->get_evic_ids(evict_ids.data(), evict_size); + Eviction(evict_ids.data(), true_size); + } + } + return s; + } + return s; + } + virtual int total_dim() = 0; + + void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } + + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { + constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; + if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { + int64 num_of_deleted_value_ptrs = + value_ptrs.size() - CAP_INVALID_VALUEPTR; + for (int i = 0; i < num_of_deleted_value_ptrs; i++) { + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); + value_ptrs.pop_front(); + } + } + } + + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); + } + + void KeepInvalidValuePtr(void* value_ptr) { + value_ptr_out_of_date_.emplace_back(value_ptr); + } + +#if GOOGLE_CUDA + void CopyEmbeddingsFromDramToHbm( + const EmbeddingVarContext& context, const K* keys, + void** value_ptr_list, std::list& copyback_cursors, + const std::vector& memory_index, + const std::vector& gpu_value_ptrs, int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); +#endif // GOOGL_CUDA + private: + virtual Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} + + protected: + std::deque value_ptr_out_of_date_; + BatchCache* cache_ = nullptr; + + EvictionManager* eviction_manager_; + thread::ThreadPool* cache_thread_pool_; + + condition_variable shutdown_cv_; + volatile bool shutdown_ = false; + + int64 cache_capacity_ = -1; + volatile bool ready_eviction_ = false; + + std::string name_; + std::vector mu_list_; +}; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram(const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + 
Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V*)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for (int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&dev_batch_data_place, + (void*)&total_dim, (void*)&batch_size}; + + cudaLaunchKernel((void*)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for (int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif // GOOGL_CUDA +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000..da844008 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h @@ -0,0 +1,127 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include + +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl : public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void* val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { alloc_->DeallocateRaw(val); } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val : value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy( + val_ptr, value, + sizeof(V) * + FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL) + << "Can't call SetDefaultValue(const K*, const std::list&," + << "void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + << " in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { alloc_ = alloc; } + + int data_bytes() override { return alloc_bytes_; } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git 
a/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h new file mode 100644 index 00000000..1d5c12f7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h @@ -0,0 +1,173 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ + +#include "batch.h" +#include "embedding_config.h" +#include "filter_policy.h" + +namespace tensorflow { +namespace embedding { +template +class Storage; +} + +template +class NullableFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + NullableFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), + feat_desc_(feat_desc), + FilterPolicy(config, ev) {} + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok()) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_ptr, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_ptr; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, + not_found_cursor_list); + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* 
default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + int64 count) override { + *is_filter = true; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = OkStatus(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; + } + + Status LookupKey(K key, void** val, bool* is_filter, int64 count) override { + *is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); + } + + int64 GetFreq(K key) override { + if (!config_.is_save_freq()) return 0; + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + int64 import_freq = 0; + int64 import_version = -1; + + if (config_.filter_freq != 0 || ev_->IsMultiLevel() || + config_.record_freq) { + import_freq = freq_buff[i]; + } + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); + } + return OkStatus(); + } + + bool is_admit(K key, void* value_ptr) override { return true; } + + private: + embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h new file mode 100644 index 00000000..12231fb9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h @@ -0,0 +1,72 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ + +#include "feature_descriptor.h" +#include "kv_interface.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +namespace embedding { +struct ShrinkArgs { + ShrinkArgs() : global_step(0), value_len(0) {} + + ShrinkArgs(int64 global_step, int64 value_len) + : global_step(global_step), value_len(value_len) {} + int64 global_step; + int64 value_len; +}; + +template +class ShrinkPolicy { + public: + ShrinkPolicy(FeatureDescriptor* feat_desc) : feat_desc_(feat_desc) {} + virtual ~ShrinkPolicy() {} + + TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); + + virtual void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) = 0; + + protected: + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } + + void ReleaseValuePtrs() { + for (auto it : to_delete_) { + feat_desc_->Deallocate(it); + } + to_delete_.clear(); + } + + protected: + std::vector to_delete_; + FeatureDescriptor* feat_desc_; +}; + +template +class NonShrinkPolicy : public ShrinkPolicy { + public: + NonShrinkPolicy() : ShrinkPolicy(nullptr) {} + TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override {} +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h new file mode 100644 index 00000000..4dd11652 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h @@ -0,0 +1,581 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SINGLE_TIER_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SINGLE_TIER_STORAGE_H_ + +#include "cache.h" +#include "cpu_hash_map_kv.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "globalstep_shrink_policy.h" +#if GOOGLE_CUDA +#include "gpu_hash_map_kv.h" +#endif // GOOGLE_CUDA +#include "kv_interface.h" +#include "l2weight_shrink_policy.h" +#include "leveldb_kv.h" +#include "ssd_hash_kv.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +namespace embedding { +template +class DramSsdHashStorage; + +template +class DramPmemStorage; + +template +class DramLevelDBStore; + +#if GOOGLE_CUDA +template +class HbmDramStorage; + +template +class HbmDramSsdStorage; +#endif + +template +class SingleTierStorage : public Storage { + public: + SingleTierStorage(const StorageConfig& sc, KVInterface* kv, + FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { + if (sc.embedding_config.steps_to_live != 0) { + shrink_policy_ = new GlobalStepShrinkPolicy( + sc.embedding_config.steps_to_live, feat_desc_, kv_); + } else if (sc.embedding_config.l2_weight_threshold != -1.0) { + shrink_policy_ = new L2WeightShrinkPolicy( + sc.embedding_config.l2_weight_threshold, + sc.embedding_config.primary_emb_index, feat_desc_, kv_); + } else { + shrink_policy_ = new NonShrinkPolicy(); + } + } + + ~SingleTierStorage() override { + mutex_lock l(Storage::mu_); + std::vector key_list; + std::vector value_ptr_list; + kv_->GetSnapshot(&key_list, &value_ptr_list); + for (auto value_ptr : value_ptr_list) { + feat_desc_->Deallocate(value_ptr); + } + delete kv_; + delete shrink_policy_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); + + Status Get(K key, void** value_ptr) override { + return kv_->Lookup(key, value_ptr); + } + + Status Contains(K key) override { return kv_->Contains(key); } + + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + do { + *value_ptr = feat_desc_->Allocate(); + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); + } + + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = kv_->Lookup(key, value_ptr); + if (s.ok()) { + return s; + } + + *value_ptr = feat_desc_->Allocate(); + s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, key already exist + feat_desc_->Deallocate(*value_ptr); + return kv_->Lookup(key, value_ptr); + } + + Status Remove(K key) override { return kv_->Remove(key); } + + int64 Size() const override { return kv_->Size(); } + + int64 Size(int level) const override { + if (level > 0) { + LOG(FATAL) << "Unsupport level>0 in SingleTierStorage."; + } + return kv_->Size(); + } + + int64 CacheSize() const override { + LOG(FATAL) << "Unsupport cachesize in SingleTierStorage."; + return 0; + } + + int LookupTier(K key) const override { + Status s = kv_->Contains(key); + return (s.ok()) ? 
0 : -1; + } + + void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) override { + LOG(FATAL) << "Unsupport CopyEmbeddingsFromCPUToGPU in SingleTierStorage."; + }; + + BatchCache* Cache() override { + LOG(FATAL) << "Unsupport Cache in SingleTierStorage."; + return nullptr; + } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + LOG(FATAL) << "Unsupport InitCache in SingleTierStorage."; + } + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); + return OkStatus(); + } + + virtual Status Commit(K keys, const void* value_ptr) { + LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + LOG(FATAL) << "Unsupport Eviction in SingleTierStorage."; + return OkStatus(); + } + + void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) override { + return; + } + + virtual void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override {} + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + mutex_lock l(Storage::mu_); + return kv_->GetSnapshot(key_list, value_ptr_list); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + mutex_lock l(Storage::mu_); + return kv_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, + partition_nums); + } + + Status Save(const std::string& tensor_name, const std::string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector value_ptr_list; + std::vector key_list_tmp; + TF_CHECK_OK(kv_->GetSnapshot(&key_list_tmp, &value_ptr_list)); + + if (emb_config.is_primary()) { + Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); + } + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, + value_ptr_list, SingleTierStorage::feat_desc_))); + return OkStatus(); + } + + bool IsMultiLevel() override { return false; } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + bool IsUsePersistentStorage() override { return false; } + + void Schedule(std::function fn) override { + LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + virtual void* CreateValuePtr() { return feat_desc_->Allocate(); } + + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); + } + + FeatureDescriptor* feature_descriptor() { return feat_desc_; } + + virtual Status RestoreFeatures(int64 key_num, int bucket_num, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_filter, bool is_incr, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, false 
/*to_dram*/, is_incr, + restore_buff); + return s; + } + + protected: + virtual void Shrink(std::vector& key_list, + std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) { + mutex_lock l(Storage::mu_); + shrink_args.value_len = value_len; + shrink_policy_->Shrink(key_list, value_ptr_list, shrink_args); + } + + protected: + KVInterface* kv_; + ShrinkPolicy* shrink_policy_; + Allocator* alloc_; + FeatureDescriptor* feat_desc_; +}; + +template +class DramStorage : public SingleTierStorage { + public: + DramStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + + ~DramStorage() override {} + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) { + return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); + } + + Status TryInsert(K key, void* value_ptr) { + return SingleTierStorage::kv_->Insert(key, value_ptr); + } + + Status Commit(K keys, const void* value_ptr) override { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); + + public: + friend class DramSsdHashStorage; + friend class DramPmemStorage; + friend class DramLevelDBStore; +#if GOOGLE_CUDA + friend class HbmDramStorage; + friend class HbmDramSsdStorage; +#endif + protected: + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; + +#if GOOGLE_CUDA +template +class HbmStorage : public SingleTierStorage { + public: + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) + : SingleTierStorage( + sc, new GPUHashMapKV(sc.embedding_config, gpu_allocator), + feat_desc) {} + ~HbmStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(HbmStorage); + + bool IsSingleHbm() override { return true; } + + void SetValueLen(int64 value_len) override { + SingleTierStorage::kv_->SetValueLen(value_len); + } + + void BatchLookupOrCreate(const K* key, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) override { + SingleTierStorage::kv_->BatchLookupOrCreate(key, val, default_v, + default_v_num, n, device); + } + + void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device) override { + SingleTierStorage::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, + device); + } + + void BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) override { + SingleTierStorage::kv_->BatchLookup(device, keys, val, n, default_v); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector value_ptr_list; + std::vector key_list_tmp; + GPUHashMapKV* gpu_kv = + dynamic_cast*>(SingleTierStorage::kv_); + gpu_kv->GetSnapshot(&key_list_tmp, &value_ptr_list, emb_config); + + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, 
writer, value_len, key_list_tmp, value_ptr_list))); + + if (value_ptr_list.size() > 0) { + TypedAllocator::Deallocate(cpu_allocator(), value_ptr_list[0], + value_ptr_list.size() * value_len); + } + return OkStatus(); + } + + GPUHashTable* HashTable() override { + return SingleTierStorage::kv_->HashTable(); + } + + protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + std::vector key_import; + std::vector value_import; + for (auto i = 0; i < key_num; ++i) { + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + LOG(INFO) << "skip EV key:" << *(key_buff + i); + continue; + } + key_import.emplace_back(*(key_buff + i)); + auto row_offset = value_buff + i * value_len; + for (int j = 0; j < value_len; j++) { + value_import.emplace_back(*(row_offset + j)); + } + } + GPUHashMapKV* gpu_kv = + dynamic_cast*>(SingleTierStorage::kv_); + gpu_kv->Import(key_import, value_import, device, emb_config); + return OkStatus(); + } +}; + +template +class HbmStorageWithCpuKv : public SingleTierStorage { + public: + HbmStorageWithCpuKv(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + + ~HbmStorageWithCpuKv() override {} + + Status TryInsert(K key, void* value_ptr) { + return SingleTierStorage::kv_->Insert(key, value_ptr); + } + + public: + friend class HbmDramStorage; + friend class HbmDramSsdStorage; + + protected: + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; +#endif // GOOGLE_CUDA + +template +class PmemMemkindStorage : public SingleTierStorage { + public: + PmemMemkindStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + ~PmemMemkindStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); +}; + +template +class PmemLibpmemStorage : public SingleTierStorage { + public: + PmemLibpmemStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + ~PmemLibpmemStorage() override {} + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + TF_DISALLOW_COPY_AND_ASSIGN(PmemLibpmemStorage); + + protected: + friend class DramPmemStorage; + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; + +template +class LevelDBStore : public SingleTierStorage { + public: + LevelDBStore(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LevelDBKV(sc.path, feat_desc), + feat_desc) {} + ~LevelDBStore() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + embedding::ValueIterator* GetValueIterator(const std::vector& key_list, + int64 emb_index, + int64 value_len) { + LevelDBKV* leveldb_kv = + 
reinterpret_cast*>(SingleTierStorage::kv_); + return new DBValueIterator(key_list, emb_index, value_len, leveldb_kv, + SingleTierStorage::feat_desc_); + } + + public: + friend class DramLevelDBStore; +}; + +template +class SsdHashStorage : public SingleTierStorage { + public: + SsdHashStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new SSDHashKV(sc.path, feat_desc), + feat_desc) {} + ~SsdHashStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + if (emb_config.is_primary()) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + SsdRecordDescriptor ssd_rec_desc; + { + mutex_lock l(Storage::mu_); + ssd_kv->SetSsdRecordDescriptor(&ssd_rec_desc); + } + ssd_rec_desc.GenerateCheckpoint(prefix, tensor_name); + } + return OkStatus(); + } + + void Import(K* key_list, int64* key_file_id_list, int64* key_offset_list, + int64 num_of_keys, std::map& file_id_map) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + + ssd_kv->Import(key_list, key_file_id_list, key_offset_list, num_of_keys, + file_id_map); + } + + void CopyEmbFilesFromCkpt(int64* file_list, int64* invalid_record_count_list, + int64* record_count_list, int64 num_of_files, + const std::string& ssd_emb_file_name) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + + ssd_kv->CopyEmbFilesFromCkpt(file_list, invalid_record_count_list, + record_count_list, num_of_files, + ssd_emb_file_name); + } + + void SetSsdRecordDescriptor(SsdRecordDescriptor* ssd_rec_desc) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + ssd_kv->SetSsdRecordDescriptor(ssd_rec_desc); + } + + public: + friend class DramSsdHashStorage; +#if GOOGLE_CUDA + friend class HbmDramSsdStorage; +#endif + + protected: + void Init() override { + dynamic_cast*>(SingleTierStorage::kv_)->Init(); + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h new file mode 100644 index 00000000..5471ef05 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h @@ -0,0 +1,802 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ + +#include +#include +#include + +#include "emb_file_creator.h" +#include "kv_interface.h" +#include "sparsehash/dense_hash_map_lockless" +#include "sparsehash/dense_hash_set_lockless" +#include "ssd_record_descriptor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { +class EmbPosition { + public: + EmbPosition(int o, size_t v, int bo, bool f) + : offset_(o), + version_(v), + buffer_offset_(bo), + flushed_(f), + invalid_(false) {} + + EmbPosition() + : offset_(-1), + version_(-1), + buffer_offset_(-1), + flushed_(false), + invalid_(false) {} + + void Print() { + LOG(INFO) << "EmbPosition: " + << "offset = " << offset_ << ", version = " << version_ + << ", buffer_offset = " << buffer_offset_ + << ", flushed = " << flushed_; + } + + public: + int offset_; + int buffer_offset_; + size_t version_; + bool flushed_; + bool invalid_; +}; + +template +class SSDIterator { + public: + SSDIterator(google::dense_hash_map_lockless* hash_map, + const std::vector& emb_files, int64 value_len, + char* write_buffer) + : emb_files_(emb_files), + curr_file_(0), + curr_vec_(0), + value_len_(value_len), + write_buffer_(write_buffer) { + for (auto it : *hash_map) { + EmbPosition* posi = it.second; + auto iter = file_map_.find(posi->version_); + if (iter == file_map_.end()) { + std::vector> tmp; + file_map_[posi->version_] = tmp; + file_id_vec_.emplace_back(posi->version_); + } + file_map_[posi->version_].emplace_back(it); + } + } + + virtual ~SSDIterator() {} + + virtual bool Valid() { return !(curr_file_ == file_id_vec_.size()); } + + virtual void SeekToFirst() { + curr_file_ = 0; + curr_vec_ = 0; + if (file_id_vec_.size() > 0) { + int64 f_id = file_id_vec_[curr_file_]; + emb_files_[f_id]->MapForRead(); + } + } + + virtual void Next() { + curr_vec_++; + int64 f_id = file_id_vec_[curr_file_]; + if (curr_vec_ == file_map_[f_id].size()) { + emb_files_[f_id]->UnmapForRead(); + curr_vec_ = 0; + curr_file_++; + if (curr_file_ < file_id_vec_.size()) + emb_files_[file_id_vec_[curr_file_]]->MapForRead(); + } + } + + virtual K Key() { + int64 f_id = file_id_vec_[curr_file_]; + return (file_map_[f_id])[curr_vec_].first; + } + + virtual int64 FileId() { return file_id_vec_[curr_file_]; } + + virtual int64 Offset() { + int64 f_id = file_id_vec_[curr_file_]; + EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; + return posi->offset_; + } + + private: + int64 value_len_; + int64 curr_file_; + int64 curr_vec_; + char* write_buffer_; + std::map>> file_map_; + std::vector file_id_vec_; + std::vector emb_files_; +}; + +template +class SSDHashKV : public KVInterface { + public: + explicit SSDHashKV(const std::string& path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { + path_ = io::JoinPath( + path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); + hash_map_.max_load_factor(0.8); + hash_map_.set_empty_key_and_value(EMPTY_KEY, nullptr); + hash_map_.set_counternum(16); + hash_map_.set_deleted_key(DELETED_KEY); + evict_file_set_.max_load_factor(0.8); + evict_file_set_.set_empty_key_and_value(EMPTY_KEY, -1); + evict_file_set_.set_counternum(16); + evict_file_set_.set_deleted_key(DELETED_KEY); + + is_async_compaction_ = true; + 
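+    // Note: the two environment variables read just below control how this
+    // store behaves. TF_SSDHASH_ASYNC_COMPACTION (bool, default true)
+    // selects the background compaction thread; setting it to false, e.g.
+    //   TF_SSDHASH_ASYNC_COMPACTION=false
+    // switches to the synchronous compactor that runs inline with Commit()
+    // and BatchCommit(). TF_SSDHASH_IO_SCHEME (default "mmap_and_madvise")
+    // picks the EmbFile implementation used for the on-disk record files.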
TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, + &is_async_compaction_)); + + std::string io_scheme = "mmap_and_madvise"; + TF_CHECK_OK(ReadStringFromEnvVar("TF_SSDHASH_IO_SCHEME", "mmap_and_madvise", + &io_scheme)); + emb_file_creator_ = EmbFileCreatorFactory::Create(io_scheme); + EmbFile* ef = + emb_file_creator_->Create(path_, current_version_, BUFFER_SIZE); + emb_files_.emplace_back(ef); + + if (!is_async_compaction_) { + LOG(INFO) + << "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; + compaction_fn_ = [this]() { Compaction(); }; + check_buffer_fn_ = [this]() { CheckBuffer(); }; + save_kv_fn_ = [this](K key, const void* value_ptr, + bool is_compaction = false) { + SaveKV(key, value_ptr, is_compaction); + }; + } else { + LOG(INFO) << "Use Async Compactor in SSDHashKV of Multi-tier Embedding " + "Storage!"; + compaction_fn_ = []() {}; + check_buffer_fn_ = [this]() { CheckBufferAsync(); }; + save_kv_fn_ = [this](K key, const void* value_ptr, + bool is_compaction = false) { + SaveKVAsync(key, value_ptr, is_compaction); + }; + compaction_thread_ = Env::Default()->StartThread( + ThreadOptions(), "COMPACTION", [this]() { CompactionThread(); }); + } + } + + void Init() { + val_len_ = feat_desc_->data_bytes(); + max_app_count_ = BUFFER_SIZE / val_len_; + write_buffer_ = new char[BUFFER_SIZE]; + unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); + key_buffer_ = new K[max_key_count]; + done_ = true; + } + + void SetSsdRecordDescriptor(SsdRecordDescriptor* ssd_rec_desc) { + mutex_lock l(compact_save_mu_); + SSDIterator ssd_iter(&hash_map_, emb_files_, val_len_, write_buffer_); + for (ssd_iter.SeekToFirst(); ssd_iter.Valid(); ssd_iter.Next()) { + ssd_rec_desc->key_list.emplace_back(ssd_iter.Key()); + ssd_rec_desc->key_file_id_list.emplace_back(ssd_iter.FileId()); + ssd_rec_desc->key_offset_list.emplace_back(ssd_iter.Offset()); + } + ssd_rec_desc->file_prefix = path_; + + for (auto file : emb_files_) { + if (file->IsDeleted()) continue; + ssd_rec_desc->file_list.emplace_back(file->Version()); + ssd_rec_desc->invalid_record_count_list.emplace_back( + file->InvalidCount()); + ssd_rec_desc->record_count_list.emplace_back(file->Count()); + } + + if (buffer_cur_ > 0) { + if (!is_async_compaction_) { + emb_files_[current_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + emb_files_[current_version_]->Flush(); + ++current_version_; + CreateFile(current_version_); + } else { + emb_files_[evict_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + emb_files_[evict_version_]->Flush(); + evict_version_ = ++current_version_; + CreateFile(evict_version_); + } + TF_CHECK_OK(UpdateFlushStatus()); + current_offset_ = 0; + buffer_cur_ = 0; + } + } + + ~SSDHashKV() override { + if (buffer_cur_ > 0) { + if (!is_async_compaction_) { + emb_files_[current_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + } else { + emb_files_[evict_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + mutex_lock l(shutdown_mu_); + shutdown_ = true; + // Need last compaction or not??? 
+ // CompactionAsync(); + delete compaction_thread_; + } + buffer_cur_ = 0; + } + for (auto it : emb_files_) { + if (!it->IsDeleted()) { + it->DeleteFile(); + } + delete it; + } + DeallocateEmbPositions(); + delete[] write_buffer_; + delete[] key_buffer_; + } + + Status UpdateFlushStatus() { + for (int i = 0; i < buffer_cur_; ++i) { + auto iter = hash_map_.find_wait_free(key_buffer_[i]); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key_buffer_[i], + " in SSDHashKV."); + } else { + iter.second->flushed_ = true; + } + } + return OkStatus(); + } + + Status Lookup(K key, void** value_ptr) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } else { + void* val = feat_desc_->Allocate(); + EmbPosition* posi = iter.second; + if (posi->flushed_) { + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); + } else { + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); + } + *value_ptr = val; + posi->invalid_ = true; + return OkStatus(); + } + } + + Status Contains(K key) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { return OkStatus(); } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return BatchCommit(keys, value_ptrs); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + compaction_fn_(); + __sync_fetch_and_add(&total_app_count_, keys.size()); + for (int i = 0; i < keys.size(); i++) { + check_buffer_fn_(); + save_kv_fn_(keys[i], value_ptrs[i], false); + delete value_ptrs[i]; + } + return OkStatus(); + } + + Status Commit(K key, const void* value_ptr) override { + compaction_fn_(); + __sync_fetch_and_add(&total_app_count_, 1); + check_buffer_fn_(); + save_kv_fn_(key, value_ptr, false); + return OkStatus(); + } + + Status Remove(K key) override { + if (hash_map_.erase_lockless(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* file_list) { + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + auto hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + key_list->emplace_back(hash_map_dump[j].first); + file_list->emplace_back(hash_map_dump[j].second); + } + } + // Free the memory of snapshot allocated by hash map. 
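+    // The bucket dump returned by GetSnapshot() is heap memory owned by the
+    // caller; it is released with free() once the keys and file positions
+    // have been copied out.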
+ free(hash_map_dump); + return OkStatus(); + } + + void Import(K* key_list, int64* key_file_id_list, int64* key_offset_list, + int64 num_of_keys, std::map& file_id_map) { + for (int i = 0; i < num_of_keys; i++) { + int64 old_file_id = key_file_id_list[i]; + int64 new_file_id = file_id_map[old_file_id]; + EmbPosition* ep = + new EmbPosition(key_offset_list[i], new_file_id, 0, true); + hash_map_.insert_lockless(std::move(std::pair( + key_list[i], const_cast(ep)))); + } + } + + void CopyEmbFilesFromCkpt(int64* file_list, int64* invalid_record_count_list, + int64* record_count_list, int64 num_of_files, + const std::string& old_file_prefix) { + // delete the file created by constructor + emb_files_[0]->DeleteFile(); + delete emb_files_[0]; + emb_files_.erase(emb_files_.begin()); + for (int64 i = 0; i < num_of_files; i++) { + std::stringstream ss; + ss << old_file_prefix << "/" << file_list[i] << ".emb"; + std::string old_file_path = ss.str(); + EmbFile* f = + emb_file_creator_->Create(path_, current_version_, BUFFER_SIZE); + ++current_version_; + f->LoadExistFile(old_file_path, record_count_list[i], + invalid_record_count_list[i]); + emb_files_.emplace_back(f); + total_app_count_ += record_count_list[i]; + } + CreateFile(current_version_); + } + + int64 Size() const override { return hash_map_.size_lockless(); } + + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); + } + + private: + void WriteFile(size_t version, size_t curr_buffer_offset) { + emb_files_[version]->Write(write_buffer_, curr_buffer_offset); + emb_files_[version]->Flush(); + } + + void CreateFile(size_t version) { + emb_files_.emplace_back( + emb_file_creator_->Create(path_, version, BUFFER_SIZE)); + } + + Status FlushAndUpdate(char* value_buffer, K* id_buffer, + EmbPosition** pos_buffer, int64& n_ids, + std::vector& invalid_files) { + { + mutex_lock l(mu_); + compaction_version_ = ++current_version_; + CreateFile(compaction_version_); + } + + emb_files_[compaction_version_]->Write(value_buffer, n_ids * val_len_); + emb_files_[compaction_version_]->AddCount(n_ids); + emb_files_[compaction_version_]->Flush(); + + for (int64 i = 0; i < n_ids; i++) { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(id_buffer[i], nullptr))); + if ((*(iter.first)).first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", id_buffer[i], + " in SSDHashKV."); + } else { + size_t offset = i * val_len_; + EmbPosition* ep = + new EmbPosition(offset, compaction_version_, offset, true); + bool flag = __sync_bool_compare_and_swap(&((*(iter.first)).second), + pos_buffer[i], ep); + if (!flag) { + emb_files_[compaction_version_]->AddInvalidCountAtomic(1); + if (emb_files_[compaction_version_]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(compaction_version_); + } + delete ep; + } else { + pos_out_of_date_compact_.emplace_back(pos_buffer[i]); + } + } + } + + for (int i = 0; i < invalid_files.size(); i++) { + evict_file_set_.erase_lockless(invalid_files[i]); + } + invalid_files.clear(); + n_ids = 0; + return OkStatus(); + } + + void CheckBuffer() { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + if (curr_buffer_offset + val_len_ > BUFFER_SIZE) { + WriteFile(current_version_, curr_buffer_offset); + if (emb_files_[current_version_]->Count() >= max_app_count_) { + ++current_version_; + current_offset_ = 0; + CreateFile(current_version_); + } + TF_CHECK_OK(UpdateFlushStatus()); + buffer_cur_ = 0; + } + } + + void CheckBufferAsync() { + size_t curr_buffer_offset = buffer_cur_ * val_len_; 
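+    // Flush condition handled below: once the next record would overflow the
+    // fixed write buffer (BUFFER_SIZE bytes), the buffered records are
+    // written to the current eviction file, flush status is updated, and a
+    // freshly created file becomes the eviction target. For example, assuming
+    // val_len_ = 512 bytes and BUFFER_SIZE = 1 << 27 (128 MB), roughly
+    // 262,144 records are buffered between flushes.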
+ if (curr_buffer_offset + val_len_ > BUFFER_SIZE) { + WriteFile(evict_version_, curr_buffer_offset); + TF_CHECK_OK(UpdateFlushStatus()); + mutex_lock l(mu_); + evict_version_ = ++current_version_; + current_offset_ = 0; + CreateFile(evict_version_); + buffer_cur_ = 0; + } + } + + void AppendToWriteBuffer(size_t curr_buffer_offset, K key, + const void* value_ptr) { + current_offset_ += val_len_; + memcpy(write_buffer_ + curr_buffer_offset, (char*)value_ptr, val_len_); + key_buffer_[buffer_cur_] = key; + ++buffer_cur_; + } + + void AppendToPositionRecordQueue(EmbPosition* old_posi) { + // A parameter that can be adjusted in the future + if (pos_out_of_date_.size() > CAP_INVALID_POS) { + EmbPosition* posi = pos_out_of_date_.front(); + delete posi; + pos_out_of_date_.pop_front(); + } + pos_out_of_date_.emplace_back(old_posi); + } + + bool UpdatePosition(EmbPosition** pos, EmbPosition* old_posi, + EmbPosition* new_posi) { + bool flag = __sync_bool_compare_and_swap(pos, old_posi, new_posi); + if (flag) { + AppendToPositionRecordQueue(old_posi); + } + return flag; + } + + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + EmbPosition* ep = new EmbPosition(current_offset_, current_version_, + curr_buffer_offset, false); + AppendToWriteBuffer(curr_buffer_offset, key, value_ptr); + + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, const_cast(ep)))); + emb_files_[ep->version_]->AddCount(1); + + if ((*(iter.first)).second != ep) { + EmbPosition* old_posi = (*(iter.first)).second; + int64 version = old_posi->version_; + if (!is_compaction) { + emb_files_[version]->AddInvalidCount(1); + // A parameter that can be adjusted in the future + if (version != current_version_ && + emb_files_[version]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(version); + } + } + UpdatePosition(&((*(iter.first)).second), old_posi, ep); + } + } + + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, + curr_buffer_offset, false); + + AppendToWriteBuffer(curr_buffer_offset, key, value_ptr); + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, const_cast(ep)))); + emb_files_[ep->version_]->AddCount(1); + + if ((*(iter.first)).second != ep) { + bool flag = false; + EmbPosition* old_posi = nullptr; + do { + old_posi = (*(iter.first)).second; + flag = UpdatePosition(&((*(iter.first)).second), old_posi, ep); + } while (!flag); + + if (!is_compaction) { + int version = old_posi->version_; + emb_files_[version]->AddInvalidCountAtomic(1); + // A parameter that can be adjusted in the future + if (version != evict_version_ && + emb_files_[version]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(version); + } + } + } + } + + void DeleteInvalidFiles() { + for (auto it : evict_file_map_) { + emb_files_[it.first]->DeleteFile(); + } + evict_file_map_.clear(); + } + + void DeleteInvalidRecord() { + for (auto it : pos_out_of_date_compact_) { + delete it; + } + pos_out_of_date_compact_.clear(); + } + + void LookupValidItems() { + for (auto it : hash_map_) { + EmbPosition* posi = it.second; + auto iter = evict_file_map_.find(posi->version_); + if (iter != evict_file_map_.end()) { + (*iter).second.emplace_back(it); + } + } + } + + void InitializeEvictMap() { + for (auto it : evict_file_set_) { + std::vector> tmp; + evict_file_map_[it] = tmp; + 
evict_file_set_.erase_lockless(it); + } + LookupValidItems(); + } + + void InitializeEvictMapWithoutErase() { + for (auto it : evict_file_set_) { + std::vector> tmp; + evict_file_map_[it] = tmp; + } + LookupValidItems(); + } + + void MoveToNewFile() { + void* val = feat_desc_->Allocate(); + for (auto it : evict_file_map_) { + EmbFile* file = emb_files_[it.first]; + total_app_count_ -= file->InvalidCount(); + file->MapForRead(); + for (auto it_vec : it.second) { + EmbPosition* posi = it_vec.second; + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); + CheckBuffer(); + SaveKV(it_vec.first, val, true); + } + file->UnmapForRead(); + } + feat_desc_->Deallocate(val); + } + + void MoveToNewFileAsync() { + char* compact_buffer = new char[BUFFER_SIZE]; + int64 n_ids = 0; + std::vector invalid_files; + unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); + K* id_buffer = new K[max_key_count]; + EmbPosition** pos_buffer = new EmbPosition*[max_key_count]; + for (auto it : evict_file_map_) { + EmbFile* file = emb_files_[it.first]; + __sync_fetch_and_sub(&total_app_count_, file->InvalidCount()); + file->MapForRead(); + for (auto it_vec : it.second) { + EmbPosition* posi = it_vec.second; + id_buffer[n_ids] = it_vec.first; + pos_buffer[n_ids] = posi; + file->ReadWithMemcpy(compact_buffer + val_len_ * n_ids, val_len_, + posi->offset_); + n_ids++; + if (n_ids == max_app_count_) { + Status st = FlushAndUpdate(compact_buffer, id_buffer, pos_buffer, + n_ids, invalid_files); + if (!st.ok()) { + LOG(WARNING) << "FLUSH ERROR: " << st.ToString(); + } + } + } + file->UnmapForRead(); + invalid_files.emplace_back(it.first); + } + Status st = FlushAndUpdate(compact_buffer, id_buffer, pos_buffer, n_ids, + invalid_files); + if (!st.ok()) { + LOG(WARNING) << "FLUSH ERROR: " << st.ToString(); + } + delete[] id_buffer; + delete[] compact_buffer; + delete[] pos_buffer; + } + + void Compaction() { + int64 hash_size = hash_map_.size_lockless(); + // These parameter that can be adjusted in the future + if (hash_size * 3 / 2 < total_app_count_ || + total_app_count_ - hash_size > CAP_INVALID_ID) { + // delete the evict_files + DeleteInvalidFiles(); + // Initialize evict_file_map + InitializeEvictMap(); + // read embeddings and write to new file + MoveToNewFile(); + } + } + + void CompactionAsync() { + int64 hash_size = hash_map_.size_lockless(); + // These parameter that can be adjusted in the future + if (hash_size * 3 / 2 < total_app_count_ || + total_app_count_ - hash_size > CAP_INVALID_ID) { + DeleteInvalidRecord(); + // delete the evict_files + DeleteInvalidFiles(); + // Initialize evict_file_map + InitializeEvictMapWithoutErase(); + // read embeddings and write to new file + MoveToNewFileAsync(); + } + } + + void CompactionThread() { + if (val_len_ == -1) { + while (!done_) { + } + } + while (!shutdown_) { + if (shutdown_mu_.try_lock()) { + if (!shutdown_) { + mutex_lock l(compact_save_mu_); + CompactionAsync(); + } + shutdown_mu_.unlock(); + } + Env::Default()->SleepForMicroseconds(1000); + } + } + + std::string DebugString() const { + return strings::StrCat( + "map info size:", Size(), + ", map info bucket_count:", hash_map_.load_factor(), + ",map info load_factor:", hash_map_.load_factor(), + ", map info max_load_factor:", hash_map_.max_load_factor(), + ", map info min_load_factor: ", hash_map_.min_load_factor(), + ", evict_version: ", evict_version_, + ", compaction_version: ", compaction_version_); + } + + private: + void DeallocateEmbPositions() { + std::pair* hash_map_dump; + int64 bucket_count; + 
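+    // Walk a final snapshot of the lockless map and delete every EmbPosition
+    // still referenced by a live bucket, skipping the EMPTY_KEY/DELETED_KEY
+    // sentinels; the snapshot array itself is freed afterwards.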
auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != SSDHashKV::EMPTY_KEY && + hash_map_dump[j].first != SSDHashKV::DELETED_KEY) { + delete hash_map_dump[j].second; + } + } + free(hash_map_dump); + } + + private: + size_t val_len_ = -1; + volatile size_t current_version_ = 0; + volatile size_t evict_version_ = 0; + volatile size_t compaction_version_ = 0; + volatile size_t current_offset_ = 0; + volatile size_t buffer_cur_ = 0; + size_t total_app_count_ = 0; + size_t max_app_count_; + + char* write_buffer_ = nullptr; + K* key_buffer_ = nullptr; + bool is_async_compaction_; + FeatureDescriptor* feat_desc_; + + int total_dims_; + std::string path_; + + typedef google::dense_hash_map_lockless LockLessHashMap; + LockLessHashMap hash_map_; + mutex mu_; + mutex shutdown_mu_; + mutex compact_save_mu_; + + static const int EMPTY_KEY; + static const int DELETED_KEY; + static const int CAP_INVALID_POS; + static const int CAP_INVALID_ID; + static const size_t BUFFER_SIZE; + + std::vector emb_files_; + std::deque pos_out_of_date_; + std::deque pos_out_of_date_compact_; + typedef google::dense_hash_set_lockless LocklessHashSet; + LocklessHashSet evict_file_set_; + std::map>> evict_file_map_; + + Thread* compaction_thread_ = nullptr; + volatile bool shutdown_ = false; + volatile bool done_ = false; + // std::atomic_flag flag_ = ATOMIC_FLAG_INIT; unused + + std::function compaction_fn_; + std::function check_buffer_fn_; + std::function save_kv_fn_; + EmbFileCreator* emb_file_creator_ = nullptr; +}; +template +const int SSDHashKV::EMPTY_KEY = -1; +template +const int SSDHashKV::DELETED_KEY = -2; +template +const int SSDHashKV::CAP_INVALID_POS = 200000; +template +const int SSDHashKV::CAP_INVALID_ID = 10000000; +template +const size_t SSDHashKV::BUFFER_SIZE = 1 << 27; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc new file mode 100644 index 00000000..60879c19 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc @@ -0,0 +1,80 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ +#include "ssd_record_descriptor.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace embedding { +template +template +void SsdRecordDescriptor::DumpSection(const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer(section_str, writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); +} +#define REGISTER_KERNELS(ktype, ttype) \ + template void SsdRecordDescriptor::DumpSection( \ + const std::vector&, const std::string&, BundleWriter*, \ + std::vector&); +REGISTER_KERNELS(int32, int32); +REGISTER_KERNELS(int32, int64); +REGISTER_KERNELS(int64, int32); +REGISTER_KERNELS(int64, int64); +#undef REGISTER_KERNELS + +template +void SsdRecordDescriptor::DumpSsdMeta(const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", &ssd_record_writer, + dump_buffer); + DumpSection(key_offset_list, "keys_offset", &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + &ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", &ssd_record_writer, + dump_buffer); + + ssd_record_writer.Finish(); +} +#define REGISTER_KERNELS(ktype) \ + template void SsdRecordDescriptor::DumpSsdMeta(const std::string&, \ + const std::string&); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h new file mode 100644 index 00000000..d5a46bc6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h @@ -0,0 +1,105 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ + +#include +#include +#include +#include +#include + +#include "embedding_var_dump_iterator.h" +#include "kv_interface.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +class BundleWriter; +namespace embedding { + +template +class SsdRecordDescriptor { + public: + // prefix of embedding file + tstring file_prefix; + // keys in ssd storage + std::vector key_list; + // file ids of features + std::vector key_file_id_list; + // offsets in the file of features + std::vector key_offset_list; + // files in ssd storage + std::vector file_list; + // number of invalid records in the file + std::vector invalid_record_count_list; + // number of records in the file + std::vector record_count_list; + + void GenerateCheckpoint(const std::string& prefix, + const std::string& var_name) { + DumpSsdMeta(prefix, var_name); + CopyEmbeddingFilesToCkptDir(prefix, var_name); + } + + private: + template + void DumpSection(const std::vector& data_vec, + const std::string& section_str, BundleWriter* writer, + std::vector& dump_buffer); + + void DumpSsdMeta(const std::string& prefix, const std::string& var_name); + + void CopyEmbeddingFilesToCkptDir(const std::string& prefix, + const std::string& var_name) { + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string embedding_folder_path = + prefix + "-" + var_name_temp + "-emb_files/"; + Status s = Env::Default()->CreateDir(embedding_folder_path); + if (errors::IsAlreadyExists(s)) { + int64 undeleted_files, undeleted_dirs; + Env::Default()->DeleteRecursively(embedding_folder_path, &undeleted_files, + &undeleted_dirs); + Env::Default()->CreateDir(embedding_folder_path); + } + + for (int64 i = 0; i < file_list.size(); i++) { + int64 file_id = file_list[i]; + std::stringstream old_ss; + old_ss << std::setw(4) << std::setfill('0') << file_id << ".emb"; + std::string file_path = file_prefix + old_ss.str(); + std::string file_name = file_path.substr(file_path.rfind("/")); + std::stringstream new_ss; + new_ss << file_id << ".emb"; + std::string new_file_path = embedding_folder_path + new_ss.str(); + Status s = Env::Default()->CopyFile(file_path, new_file_path); + if (!s.ok()) { + LOG(FATAL) << "Copy file " << file_path << " failed!"; + } + } + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage.h new file mode 100644 index 00000000..40817e59 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage.h @@ -0,0 +1,367 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ + +#include "cache.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "embedding_memory_pool.h" +#include "embedding_var_ckpt_data.h" +#include "embedding_var_restore.h" +#include "filter_policy.h" +#include "kv_interface.h" +#include "shrink_policy.h" +#include "storage_config.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template +class CheckpointLoader; + +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +template +class GPUHashTable; + +class BundleWriter; +class BundleReader; + +template +struct EmbeddingVarContext; +namespace embedding { + +template +class Storage { + friend class CheckpointLoader; + + public: + explicit Storage(const StorageConfig& storage_config) + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } + virtual ~Storage() {} + TF_DISALLOW_COPY_AND_ASSIGN(Storage); + + virtual Status Get(K key, void** value_ptr) = 0; +#if GOOGLE_CUDA + virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, + void** value_ptr_list, int64 num_of_keys) {} + + virtual void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* key, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_found_cursor_list) {} +#endif // GOOGLE_CUDA + virtual Status Contains(K key) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} + virtual void SetValueLen(int64 value_len) {} + virtual Status GetOrCreate(K key, void** value_ptr) = 0; + virtual int LookupTier(K key) const = 0; + virtual Status Remove(K key) = 0; + virtual int64 Size() const = 0; + virtual int64 Size(int level) const = 0; + virtual Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, + int partition_nums) = 0; + virtual Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) = 0; + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) = 0; + + virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; + + virtual void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) = 0; + + virtual void BatchLookupOrCreate(const K* key, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) {} + virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device) 
{} + virtual void BatchLookup(const Eigen::GpuDevice& device, const K* keys, + V* val, size_t n, const V* default_v) {} + virtual GPUHashTable* HashTable() { return nullptr; } + + virtual void InitCache(embedding::CacheStrategy cache_strategy) = 0; + virtual int64 CacheSize() const = 0; + virtual BatchCache* Cache() = 0; + virtual bool IsMultiLevel() = 0; + virtual bool IsUseHbm() = 0; + virtual bool IsSingleHbm() = 0; + virtual bool IsUsePersistentStorage() { return false; }; + virtual void Schedule(std::function fn) = 0; + virtual void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) = 0; + + inline mutex* get_mutex() { return &mu_; } + inline int64 GetAllocLen() { return alloc_len_; } + inline int64 GetOffset(int64 index) { return alloc_len_ * index; } + inline int64 GetTotalDims() { return total_dims_; } + inline embedding::StorageType GetStorageType() { + return storage_config_.type; + } + inline std::string GetStoragePath() { return storage_config_.path; } + inline embedding::CacheStrategy CacheStrategy() { + return storage_config_.cache_strategy; + } + + inline std::string DebugString() const { + return strings::StrCat("class type: ", typeid(this).name(), + " alloc len: ", alloc_len_, + " total dims: ", total_dims_, + " storage config: ", storage_config_.DebugString()); + } + + inline void Insert(const std::vector& keys, void** value_ptrs) { + for (size_t i = 0; i < keys.size(); i++) { + Insert(keys[i], value_ptrs[i]); + } + } + + virtual void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) {} + + virtual void UpdateCache(const Tensor& indices) {} + + virtual void AddToCachePrefetchList(const Tensor& indices) {} + + virtual void AddToCache(const Tensor& indices) {} + + virtual void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + restorer.RestoreCkpt(emb_config, device); + }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, int64 freq, int64 version, + int emb_index) = 0; + + virtual Status RestoreFeatures(int64 key_num, int bucket_num, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_filter, bool is_incr, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) { + return OkStatus(); + } + + protected: + virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num, + int64 value_len, + const std::string& ssd_emb_file_name, + EmbeddingVar* ev, + RestoreSSDBuffer& restore_buff) { + for (int64 i = 0; i < restore_buff.num_of_keys; i++) { + int64 file_id = restore_buff.key_file_id_list_buf[i]; + int64 key_offset = restore_buff.key_offset_list_buf[i]; + // Read data from embedding files on SSD. Data are stored in + // NormalContiguousValuePtr temporarily. 
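+      // Each record is read from "<ssd_emb_file_name>/<file_id>.emb" at
+      // key_offset: the region is mmapped read-only, copied into a temporary
+      // DRAM feature-descriptor slot, handed to Import() together with its
+      // saved frequency and version, and the slot is then deallocated.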
+ std::stringstream ss; + ss << ssd_emb_file_name << "/" << file_id << ".emb"; + int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, ev_allocator(), + StorageType::DRAM, true, true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); + char* file_addr = + (char*)mmap(nullptr, normal_feat_desc.data_bytes() + key_offset, + PROT_READ, MAP_PRIVATE, fd, 0); + memcpy(value_ptr, file_addr + key_offset, normal_feat_desc.data_bytes()); + munmap(file_addr, normal_feat_desc.data_bytes() + key_offset); + close(fd); + // Copy Data to ValuePtr, data of slots are set by primary here. + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, import_freq, import_version, + emb_index); + normal_feat_desc.Deallocate(value_ptr); + } + return OkStatus(); + } + + private: + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, V* default_value, + FeatureDescriptor* feat_desc) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], emb_config, default_value, + feat_desc, is_save_freq, is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, V* default_value, + const std::vector*>& feat_desc) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + 
ev_ckpt_data_parts[part_id].Emplace(key_list[i], value_ptr_list[i]); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + protected: + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + const EmbeddingConfig& emb_config, int64 value_len, + V* default_value, const std::vector& key_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, + value_len, value_iter); + return OkStatus(); + } + + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + const EmbeddingConfig& emb_config, int64 value_len, + V* default_value, const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, + value_len, value_iter); + return OkStatus(); + } + + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + int64 value_len, const std::vector& key_list, + const std::vector& value_ptr_list) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data); + Status s = + partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, value_len); + return OkStatus(); + } + + protected: + int64 alloc_len_ = 0; + int64 total_dims_ = 0; + StorageConfig storage_config_; + + mutex mu_; + std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h new file mode 100644 index 00000000..79e17ae1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h @@ -0,0 +1,59 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ + +#include "cache.h" +#include "embedding_config.h" + +namespace tensorflow { +namespace embedding { +struct StorageConfig { + StorageConfig() + : type(StorageType::DEFAULT), + path(""), + cache_strategy(CacheStrategy::LFU) { + size = {1 << 30, 1 << 30, 1 << 30, 1 << 30}; + } + + StorageConfig(StorageType t, const std::string& p, + const std::vector& s, const EmbeddingConfig& ec, + const CacheStrategy cache_strategy_ = CacheStrategy::LFU) + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} + StorageType type; + std::string path; + std::vector size; + CacheStrategy cache_strategy; + EmbeddingConfig embedding_config; + + std::string DebugString() const { + std::string size_str = + std::accumulate(std::next(size.begin()), size.end(), + std::to_string(size[0]), [](std::string a, int64_t b) { + return std::move(a) + "_" + std::to_string(b); + }); + return strings::StrCat("storage type: ", type, " storage path: ", path, + " storage capacity: ", size_str, + " cache strategy: ", cache_strategy); + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h new file mode 100644 index 00000000..67d8a0b6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h @@ -0,0 +1,78 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "dram_leveldb_storage.h" +#include "dram_pmem_storage.h" +#include "dram_ssd_storage.h" +#include "hbm_dram_ssd_storage.h" +#include "hbm_dram_storage.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { +class StorageFactory { + public: + template + static Storage* Create(const StorageConfig& sc, + Allocator* gpu_allocator, + FeatureDescriptor* feat_desc, + const string& name) { + switch (sc.type) { + case StorageType::DRAM: + return new DramStorage(sc, feat_desc); + case StorageType::PMEM_MEMKIND: + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); + case StorageType::PMEM_LIBPMEM: + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); + case StorageType::DRAM_PMEM: + return new DramPmemStorage(sc, feat_desc, name); + case StorageType::LEVELDB: + case StorageType::DRAM_LEVELDB: + return new DramLevelDBStore(sc, feat_desc, name); + case StorageType::SSDHASH: + case StorageType::DRAM_SSDHASH: + return new DramSsdHashStorage(sc, feat_desc, name); + case StorageType::HBM: +#if GOOGLE_CUDA + return new HbmStorage(sc, gpu_allocator, feat_desc); +#endif // GOOGLE_CUDA + case StorageType::HBM_DRAM: +#if GOOGLE_CUDA + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); +#endif // GOOGLE_CUDA + case StorageType::HBM_DRAM_SSDHASH: +#if GOOGLE_CUDA + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); +#endif // GOOGLE_CUDA + default: + return new DramStorage(sc, feat_desc); + } + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc new file mode 100644 index 00000000..e50be39e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc @@ -0,0 +1,757 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace { +// input: input tensor value (it sores the id) +// cols: How many elements to do SparseSegmentSum +// output: rows * embedding_size +template +static void sparse_gather_v1(T *input, int rows, int cols, + float *embedding_table, float *output, + int embedding_size, bool is_mean) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < embedding_size; ++j) { + float value = 0; + int dense_num = 0; + for (int k = 0; k < cols; ++k) { + int embedding_row = (int)pidx[k]; + if (embedding_row >= 0) { + value += embedding_table[embedding_row * embedding_size + j]; 
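+          // Only non-negative ids contribute; dense_num counts the valid ids in
+          // this row so the "mean" combiner can divide by that count below.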
+ dense_num += 1; + } + } + + if (is_mean && dense_num > 0) { + *output++ = value / dense_num; + } else { + *output++ = value; + } + } + pidx += cols; + } +} + +// embedding_size = 1 +template +static void sparse_gather_embeddingsize1(T *input, int rows, int cols, + float *embedding_table, float *output, + bool is_mean) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + float value = 0; + int dense_num = 0; + for (int k = 0; k < cols; ++k) { + int embedding_row = pidx[k]; + if (embedding_row >= 0) { + value += embedding_table[embedding_row]; + dense_num += 1; + } + } + if (is_mean && dense_num > 0) { + *output++ = value / dense_num; + } else { + *output++ = value; + } + pidx += cols; + } +} + +// input cols = 1 +template +static void sparse_gather_column1(T *input, int rows, float *embedding_table, + float *output, int embedding_size) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + int embedding_row = *pidx++; + if (embedding_row >= 0) { + float *pembedding = &embedding_table[embedding_row * embedding_size]; + for (int j = 0; j < embedding_size; ++j) { + output[j] = pembedding[j]; + } + } else { + for (int j = 0; j < embedding_size; ++j) { + output[j] = 0; + } + } + output += embedding_size; + } +} + +template +static void sparse_gather(T *input, int rows, int cols, float *embedding_table, + float *output, int embedding_size, bool is_mean) { + if (embedding_size == 1) { + sparse_gather_embeddingsize1(input, rows, cols, embedding_table, output, + is_mean); + } else if (cols == 1) { + sparse_gather_column1(input, rows, embedding_table, output, embedding_size); + } else { + // printf("General sparse gather!\n"); + sparse_gather_v1(input, rows, cols, embedding_table, output, embedding_size, + is_mean); + } +} + +// Use memcpy or manually assign? 
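+// A straight copy can use memcpy (mycopy below); the element-wise loops are kept
+// where accumulation (myadd) or in-place scaling (myscale) is required.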
+static void mycopy(float *dst, float *src, int float_num) { + memcpy(dst, src, float_num * sizeof(float)); +} + +static void myadd(float *dst, float *src, int float_num) { + for (int i = 0; i < float_num; ++i) { + dst[i] += src[i]; + } +} + +template +static void row_add(std::map> &mapSet, int64 row_nums) { + for (auto it = mapSet.begin(); it != mapSet.end(); ++it) { + T *dst = it->first; + std::vector srcs(std::move(it->second)); + int64 src_size = srcs.size(); + + for (int row = 0; row < row_nums; ++row) { + dst[row] = 0.0; + for (int index = 0; index < src_size; ++index) { + dst[row] += srcs[index][row]; + } + } + } +} + +template +static void row_add_mean(std::map> &mapSet, + int64 row_nums, bool is_mean) { +#define L(n) srcs[index + n][row] + + for (auto it = mapSet.begin(); it != mapSet.end(); ++it) { + T *dst = it->first; + std::vector srcs(std::move(it->second)); + int64 src_size = srcs.size(); + + if (src_size == 1) { + for (int row = 0; row < row_nums; ++row) { + dst[row] = srcs[0][row]; + } + continue; + } + + float sum_tmp = 0.0; + int64 index = 0; + int64 r = (src_size) % 8; + int64 m = 1; + if (src_size < 10 && is_mean) m = src_size; + + for (int row = 0; row < row_nums; ++row) { + sum_tmp = 0.0; + index = 0; + dst[row] = 0.0; + switch (r) { + case 2: { + sum_tmp = (L(0) + L(1)) / m; + dst[row] = sum_tmp; + break; + } + case 3: { + sum_tmp = (L(0) + L(1) + L(2)) / m; + dst[row] = sum_tmp; + break; + } + case 4: { + sum_tmp = (L(0) + L(1) + L(2) + L(3)) / m; + dst[row] = sum_tmp; + break; + } + case 5: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; + dst[row] = sum_tmp; + break; + } + case 6: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; + dst[row] = sum_tmp; + break; + } + case 7: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; + dst[row] = sum_tmp; + break; + } + case 0: { + dst[row] = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; + index += 8; + break; + } + case 1: { + dst[row] = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / + m; + index += 8; + break; + } + } + for (index += r; index < src_size; index += 8) { + sum_tmp = L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); + dst[row] += sum_tmp; + } + if (src_size >= 10 && is_mean) dst[row] /= src_size; + } + } +} + +static void myscale(float *dst, float factor, int float_num) { + for (int i = 0; i < float_num; ++i) { + dst[i] *= factor; + } +} + +template +static void sparse_gather(Tid *input, int64 input_size, Tshape *indice, + int indice_dim, Tshape *shape, int rows, int cols, + float *embedding_table, float *output, + int embedding_size, bool is_mean) { + // Record how many values in each row + int *row_values = new int[rows]; + memset(row_values, 0, rows * sizeof(int)); + + std::map> mapSet; + + for (int64 i = 0; i < input_size; ++i) { + Tid id = input[i]; + if (i < input_size && input[i] < 0) { // Skip invalid id + continue; + } + auto row = indice[i * indice_dim]; + // for (int k = 1; k < indice_dim - 1; ++k) { + // row = row * shape[k] + indice[i * indice_dim + k]; + // } + row_values[row] += 1; + + auto index = row * embedding_size; + if (!mapSet.count(&output[index])) { + std::vector srcs; + mapSet[&output[index]] = srcs; + } + mapSet[&output[index]].push_back(&embedding_table[id * embedding_size]); + } + + // row_add(mapSet, embedding_size); + row_add_mean(mapSet, embedding_size, is_mean); + + for (int i = 0; i < rows; ++i) { + if (row_values[i] == 0) { + memset(&output[i * embedding_size], 0, embedding_size * sizeof(float)); + 
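+      // Batch rows that saw no valid id stay zero-filled. Mean scaling is already
+      // applied inside row_add_mean, so the per-row rescale below stays disabled.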
// } else if (is_mean && row_values[i] > 1) { + // float factor = 1.0f / row_values[i]; + // myscale(&output[i * embedding_size], factor, embedding_size); + } + } + delete[] row_values; +} +} // namespace + +/* + sample: [['green' 'red' 'blue' 'yellow' 'pink' 'blue' 'red' 'indigo'] + ['' '' '' '' '' '' '' ''] + ['' '' '' 'yellow' 'pink' 'blue' 'red' 'indigo'] + ['' '' '' '' '' '' '' ''] + ['green' '' '' '' '' '' '' '']] + => [[ True True True True True True True True] + [False False False False False False False False] + [False False False True True True True True] + [False False False False False False False False] + [ True False False False False False False False]] +-------------------------------------------------------------------------------------- + weight: float[[ 0.23860918 0.07992432 -0.7441818 ] + [-0.8256738 -0.50271106 0.39016065] + [-0.7978571 0.3993331 -0.12494776] + [-0.555991 -0.6705441 -0.23192379] + [-0.5283828 0.19715567 0.12184268]] + input: int64[4 0 0 1 1 0 0 1 1 1 0 0 1 4] from StringToHashBucketFast output + dense_shape: int64[5 8] + indice: int64[[0 0] from to_sparse_input/indices(Where) output + [0 1] + [0 2] + [0 3] + [0 4] + [0 5] + [0 6] + [0 7] + [2 3] + [2 4] + [2 5] + [2 6] + [2 7] + [4 0]] + embedded: float[[-0.25637093 -0.12391002 -0.21055032] + [ 0. 0. 0. ] + [-0.3999606 -0.2696569 -0.06357633] + [ 0. 0. 0. ] + [-0.5283828 0.19715567 0.12184268]] +----------------------------------------------------------------------------------- + input_size: sum of input tensor size == 14 + indice_dim: dim_size(1) of indice tensor[14, 2] == 2 + shape: dense_shape == [5 8] + batch_size: dim of dense_shape == 5 + cols: dim_size(1) of dense_shape == 8 + embedding_size: dim_size(1) of weight tensor == 3 + sparse_gather(input, input_size, indice, indice_dim, shape, batch_size, + cols, weight, output, embedding_size, is_mean); +*/ + +template +class FusedSafeEmbeddingLookupSparseLocalOp : public OpKernel { + public: + explicit FusedSafeEmbeddingLookupSparseLocalOp(OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_)); + // OP_REQUIRES_OK(context, context->GetAttr("Dims", &dims)); + node_name = context->def().name(); + } + + ~FusedSafeEmbeddingLookupSparseLocalOp() {} + + void Compute(OpKernelContext *context) override { + // Grab the weight + float *weight; + const Tensor *weight_tensor = &context->input(0); + + // for saved model + if (weight_tensor->dtype() == DT_RESOURCE) { + Var *variable; + OP_REQUIRES_OK( + context, + LookupResource(context, HandleFromInput(context, 0), &variable)); + core::ScopedUnref s(variable); + weight_tensor = variable->tensor(); + OP_REQUIRES( + context, weight_tensor->dtype() == DT_FLOAT, + errors::InvalidArgument("Expect float weight in ", node_name)); + } + + weight = (float *)weight_tensor->tensor_data().data(); + + // Input id + const Tensor &input_tensor = context->input(1); + Tid *input = (Tid *)input_tensor.tensor_data().data(); + + const Tensor &shape_tensor = context->input(2); + Tshape *shape = (Tshape *)shape_tensor.tensor_data().data(); + + // To check the input + OP_REQUIRES( + context, (shape_tensor.dims() == 1), + errors::InvalidArgument("Shape tensor is not valid (dims != 1)")); + OP_REQUIRES( + context, (shape_tensor.dim_size(0) >= 2), + errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)")); + + int64 input_size = 1; + for (int i = 0; i < input_tensor.dims(); ++i) { + input_size *= input_tensor.dim_size(i); + } + + int input_dims = 
shape_tensor.dim_size(0); + int cols = shape[input_dims - 1]; + int batch_size = 1; + for (int i = 0; i < input_dims - 1; ++i) { + batch_size *= shape[i]; + } + int embedding_size = weight_tensor->dim_size(1); + bool is_mean = (combiner_ == "mean"); + + const Tensor &indice_tensor = context->input(3); + Tshape *indice = (Tshape *)indice_tensor.tensor_data().data(); + int indice_dim = indice_tensor.dim_size(1); + + // Create an output tensor + Tensor *output_tensor = NULL; + TensorShape output_shape({batch_size, embedding_size}); + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + float *output = (float *)output_tensor->tensor_data().data(); + + if (false && input_size == batch_size * cols) { // input id is dense + // fixme(marvin): disable this branch just for test. + sparse_gather(input, batch_size, cols, weight, output, embedding_size, + is_mean); + } else { // input id is sparse + OP_REQUIRES(context, (indice_tensor.dims() == 2), + errors::InvalidArgument( + "Indice tensor is not as expected (dims != 2)")); + OP_REQUIRES( + context, (indice_tensor.dim_size(0) == input_size), + errors::InvalidArgument( + "Indice tensor is not as expected (dim_size(0) != batch_size)")); + sparse_gather(input, input_size, indice, indice_dim, shape, batch_size, + cols, weight, output, embedding_size, is_mean); + } + } + + private: + std::string combiner_; + std::string node_name; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedSafeEmbeddingLookupSparseLocal") + .Device(DEVICE_CPU) + .TypeConstraint("T_id") + .TypeConstraint("T_shape"), + FusedSafeEmbeddingLookupSparseLocalOp); + +REGISTER_KERNEL_BUILDER( + Name("FusedSafeEmbeddingLookupSparseLocal") + .Device(DEVICE_CPU) + .TypeConstraint("T_id") + .TypeConstraint("T_shape"), + FusedSafeEmbeddingLookupSparseLocalOp); + +enum class SparseSegmentReductionOperation { kSum, kMean, kSqrtN }; + +namespace functor { + +template +struct SparseSegmentGradFunctor { + void operator()(OpKernelContext *context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + typename TTypes::Matrix output_flat) { + const int64_t N = indices_vec.size(); + const SegmentId M = output_flat.dimension(0); + + // Note that similar to SparseSegmentMean, we assume that segment_vec is + // already sorted and has non-negative values. + const SegmentId num_segments = input_flat.dimension(0); + const SegmentId last_segment_id_plus_one = + internal::SubtleMustCopy(segment_vec(N - 1)) + 1; + OP_REQUIRES(context, last_segment_id_plus_one <= num_segments, + errors::InvalidArgument("Invalid number of segments")); + + // Compute scaling factors for input. + std::vector scaling( + (operation == SparseSegmentReductionOperation::kSum ? 
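+            // kSum needs no per-segment scaling, so the vector is sized 0 here;
+            // kMean/kSqrtN allocate one slot per input segment and fill it below.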
0 : num_segments), + 0.0); + if (operation != SparseSegmentReductionOperation::kSum) { + for (int64_t i = 0; i < N; ++i) { + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(idx, num_segments), + errors::InvalidArgument("Segment id ", idx, " out of range [0, ", + num_segments, ").")); + scaling[idx] += 1; + } + for (size_t i = 0; i < scaling.size(); ++i) { + switch (operation) { + case SparseSegmentReductionOperation::kSum: { + OP_REQUIRES( + context, false, + errors::Internal( + "Should not happen: sum inside SparseSegmentReductionOp " + "scaling generation.")); + } + case SparseSegmentReductionOperation::kMean: { + scaling[i] = 1.0 / std::max(scaling[i], 1.0); + break; + } + case SparseSegmentReductionOperation::kSqrtN: { + scaling[i] = 1.0 / sqrt(std::max(scaling[i], 1.0)); + break; + } + // No default to get compiler warnings for missing cases. + } + } + } + + output_flat.setZero(); + std::vector is_modified(M, false); + + for (int64_t i = 0; i < N; ++i) { + const Index output_idx = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(context, FastBoundsCheck(output_idx, M), + errors::InvalidArgument("Index ", output_idx, + " out of range [0, ", M, ").")); + + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(idx, num_segments), + errors::InvalidArgument("Segment id ", idx, " out of range [0, ", + num_segments, ").")); + + const T scale = (operation == SparseSegmentReductionOperation::kSum + ? static_cast(1) + : static_cast(scaling[idx])); + if (is_modified[output_idx]) { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx) * scale; + } + } else { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx) * scale; + } + } + is_modified[output_idx] = true; + } + } +}; + +} // namespace functor + +template +class FusedSafeEmbeddingLookupSparseLocalGradOp : public OpKernel { + public: + explicit FusedSafeEmbeddingLookupSparseLocalGradOp( + OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_)); + // OP_REQUIRES_OK(context, context->GetAttr("Dims", &dims)); + + if (combiner_ == "sum") { + operation_ = SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + context, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + + node_name = context->def().name(); + + static bool printed = false; + if (!printed) { + printf("******** FusedSafeEmbeddingLookupSparseLocalGradOp ********\n"); + printed = true; + } + } + + ~FusedSafeEmbeddingLookupSparseLocalGradOp() {} + + void Compute(OpKernelContext *context) override { + // Grab gradients + const Tensor &gradients_tensor = context->input(0); + T *gradients = (T *)gradients_tensor.tensor_data().data(); + OP_REQUIRES( + context, (gradients_tensor.dims() == 2), + errors::InvalidArgument("Gradients tensor is not valid (dims != 2)")); + int64 gradients_row = gradients_tensor.dim_size(0); + int64 embedding_col = gradients_tensor.dim_size(1); + + // Grad input hash 
value + const Tensor &input_tensor = context->input(1); + Tinput *input = (Tinput *)input_tensor.tensor_data().data(); + int64 input_size = 1; + for (int i = 0; i < input_tensor.dims(); ++i) { + input_size *= input_tensor.dim_size(i); + } + + // Grad indices value + const Tensor &indices_tensor = context->input(2); + Tindices *indices_ptr = (Tindices *)indices_tensor.tensor_data().data(); + int indices_row = indices_tensor.dim_size(0); + int indices_col = indices_tensor.dim_size(1); + OP_REQUIRES(context, (indices_tensor.dims() == 2), + errors::InvalidArgument( + "Indice tensor is not as expected (dims != 2)")); + OP_REQUIRES( + context, (indices_tensor.dim_size(0) == input_size), + errors::InvalidArgument( + "Indice tensor is not as expected (dim_size(0) != batch_size)")); + std::vector input_indices; // collect first col + for (int64 i = 0; i < indices_row; ++i) { + input_indices.emplace_back(indices_ptr[i * indices_col]); + } + + // Grad input dense shape + const Tensor &dense_shape_tensor = context->input(3); + Tdense_shape *dense_shape = + (Tdense_shape *)dense_shape_tensor.tensor_data().data(); + OP_REQUIRES( + context, (dense_shape_tensor.dims() == 1), + errors::InvalidArgument("Shape tensor is not valid (dims != 1)")); + OP_REQUIRES( + context, (dense_shape_tensor.dim_size(0) >= 2), + errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)")); + int input_dims = dense_shape_tensor.dim_size(0); + int input_cols = dense_shape[input_dims - 1]; + int batch_size = 1; + for (int i = 0; i < input_dims - 1; ++i) { + batch_size *= dense_shape[i]; + } + OP_REQUIRES( + context, (gradients_row == batch_size), + errors::InvalidArgument("gradients row is not same as batch_size)")); + + // Grad combiner + // bool is_mean = (combiner == 1); + + // compute unique value and indices of input hash value + std::vector unique_value; + std::vector unique_indices; + unique_value.reserve(input_size); + unique_indices.reserve(input_size); + for (int64 i = 0; i < input_size; ++i) { + Tinput id = input[i]; + if (id < 0) { // Skip invalid id + continue; + } + auto it = std::find(unique_value.begin(), unique_value.end(), id); + if (it == unique_value.end()) { // no find + unique_indices.push_back(unique_value.size()); + unique_value.push_back(id); + } else { + unique_indices.push_back(it - unique_value.begin()); + } + } + + // printf("unique_indices: "); + // for (int i = 0; i < unique_indices.size(); ++i) + // printf("%d ", unique_indices[i]); + // printf("\n"); + + // printf("input_indices: "); + // for (int i = 0; i < input_indices.size(); ++i) + // printf("%d ", input_indices[i]); + // printf("\n"); + + // Create an output tensor + Tensor *output_tensor = NULL; + TensorShape output_shape({unique_value.size(), embedding_col}); + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + output_tensor->flat().setZero(); + T *output = (T *)output_tensor->tensor_data().data(); + + memset(output, 0, embedding_col * sizeof(float) * unique_value.size()); + + Tensor *unique_tensor = NULL; + TensorShape unique_shape({unique_value.size()}); + OP_REQUIRES_OK(context, + context->allocate_output(1, unique_shape, &unique_tensor)); + Tinput *unique = (Tinput *)unique_tensor->tensor_data().data(); + + int64 unique_num = unique_value.size(); + for (int64 i = 0; i < unique_num; ++i) { + unique[i] = unique_value[i]; + } + + // if (input_size == batch_size * input_cols) { // input id is dense + // } else { // input id is sparse + // } + + if (operation_ == 
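+    // "mean" reuses the shared SparseSegmentGradFunctor below; "sum" takes the
+    // direct row_add accumulation path further down.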
SparseSegmentReductionOperation::kMean) { + auto input_flat = gradients_tensor.flat_outer_dims(); + typename TTypes::ConstVec indices_vec(unique_indices.data(), + unique_indices.size()); + typename TTypes::ConstVec segment_vec(input_indices.data(), + input_indices.size()); + auto output_flat = output_tensor->flat_outer_dims(); + functor::SparseSegmentGradFunctor()( + context, operation_, input_flat, indices_vec, segment_vec, + output_flat); + } else if (operation_ == SparseSegmentReductionOperation::kSum) { + uint64 rows = unique_indices.size(); + // std::vector row_values(unique_value.size(), 0); + std::map> mapSet; + + for (int64 i = 0; i < rows; ++i) { + // row_values[unique_indices[i]] += 1; + + auto index = unique_indices[i] * embedding_col; + // memset(&output[index * embedding_col], 0, embedding_col * + // sizeof(float)); + if (!mapSet.count(&output[index])) { + std::vector srcs; + mapSet[&output[index]] = srcs; + } + mapSet[&output[index]].push_back( + &gradients[input_indices[i] * embedding_col]); + } + + row_add(mapSet, embedding_col); + // printf("******Goto row_add_mean func.******\n"); + // row_add_mean(mapSet, embedding_col, false); + + // for (int i = 0; i < unique_value.size(); ++i) { + // if (row_values[i] == 0) { + // memset(&output[i * embedding_col], 0, embedding_col * + // sizeof(float)); + // } + // } + // delete[] row_values; + + } else if (operation_ == SparseSegmentReductionOperation::kSqrtN) { + } + } + + private: + template + void copy(Tdata *dst, const Tdata *src, const int64 num) { + memcpy(dst, src, num * sizeof(T)); + } + + template + void add(Tdata *dst, const Tdata *src, const int64 num) { + for (int64 i = 0; i < num; ++i) { + dst[i] += src[i]; + } + } + + template + void scale(Tdata *dst, const Tdata factor, const int64 num) { + for (int64 i = 0; i < num; ++i) { + dst[i] *= factor; + } + } + + private: + std::string combiner_; + std::string node_name; + SparseSegmentReductionOperation operation_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedSafeEmbeddingLookupSparseLocalGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("Tinput") + .TypeConstraint("Tindices") + .TypeConstraint("Tdense_shape"), + FusedSafeEmbeddingLookupSparseLocalGradOp< + CPUDevice, float, int64, int32, int64>); + +REGISTER_KERNEL_BUILDER(Name("FusedSafeEmbeddingLookupSparseLocalGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("Tinput") + .TypeConstraint("Tindices") + .TypeConstraint("Tdense_shape"), + FusedSafeEmbeddingLookupSparseLocalGradOp< + CPUDevice, float, int64, int64, int64>); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc new file mode 100644 index 00000000..dc222b12 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc @@ -0,0 +1,901 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string &combiner_str, float &max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_emb_vector_expected(Tensor *expected); + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, + 
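+      // Same sqrtn combine as above (row = sum of gathered embeddings / sqrt(#ids)),
+      // except embedding rows whose L2 norm exceeds max_norm (200 here) are first
+      // rescaled down to that norm; e.g. embedding row 12 has norm ~281.5, so it is
+      // scaled by 200/281.5 before combining, which is why only the last two batch
+      // rows differ from the plain sqrtn expectation.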
{22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class FusedEmbeddingLocalSparseLookUpOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up", + "FusedEmbeddingLocalSparseLookUp") + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(dtype)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor &emb_vector = *GetOutput(0); + const Tensor &values_offset = *GetOutput(1); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + test::ExpectTensorEqual(sp_values_offset_expected, values_offset); + } +}; + +template +void fill_grad_expected(Tensor *expected); + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + 
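+      // sqrtn gradient: each id in a batch row with n valid ids receives
+      // top_grad_row / sqrt(n). Batch row 0 holds 2 ids, so its per-id rows start
+      // at 0/sqrt(2)=0, 1/sqrt(2)~0.7071, 2/sqrt(2)~1.4142, ...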
expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 
12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, + 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, + 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, + 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, + 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, + 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, + 7.00000000, 7.33333349, 7.66666651, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, + 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, + 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +} + +class FusedEmbeddingLocalSparseLookUpGradOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up_grad", + "FusedEmbeddingLocalSparseLookUpGrad") + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int bucket_size = 16; + + Tensor top_grad(dtype, {batch_size, emb_vector_dim}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_values_offset(DT_INT32, {batch_size}); + + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 
121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(dtype, {nnz, emb_vector_dim}); + fill_grad_expected(&grad_expected); + + const Tensor &grad = *GetOutput(0); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } +}; + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocal", + "FusedSafeEmbeddingLookupSparseLocal") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {bucket_size, emb_vector_dim}); + + // [3, 1, 4, 5, 7, 3, 12, 12, 15, 4] + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_weight, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + // [0, 0, 1, 1, 1, 2, 2, 2, 3, 3] + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + // Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + // test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor &emb_vector = *GetOutput(0); + // const Tensor& values_offset = *GetOutput(1); + // TF_EXPECT_OK(device_->Sync()); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, 
emb_vector, 1e-2); + // test::ExpectTensorEqual(sp_values_offset_expected, values_offset); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalGradFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocalGrad", + "FusedSafeEmbeddingLookupSparseLocalGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // input hash value + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 32; + const int batch_size = 32; + const int emb_vector_dim = 4; + const int entries = 1; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, + {-0.00363823911, 0.0138593055, 0.00232614437, 0.00241222954, + -0.000268990319, -0.00410466315, 0.00478722388, -0.000196215493, + -0.0044340631, -0.00725936424, -0.00691315765, -0.00612797868, + -0.00678675482, -0.00246100035, 0.00216219737, -0.00346030248, + 0.00100048154, -0.00852716807, 0.00803291425, -0.000800206966, + -3.03583856e-05, 0.00524863973, -0.0163001865, -0.0109826243, + 0.0830041766, 0.153927863, -0.0508279465, -0.00474824524, + 7.8225421e-05, -0.000293536956, 0.00610643439, -0.00019871055, + -0.000780000235, -0.00221115421, 0.00387162319, 0.00222597015, + -0.0102384416, -0.00801581, -0.0017716008, 0.00598057127, + -0.00808391348, -0.00166459556, 0.00106997311, -0.00185864791, + 0.00491535058, -0.00633693347, 0.0212651137, 0.00704831816, + -0.00338345463, -0.00668374076, -0.0000871402444, -0.000196078254, + 0.00254824688, -0.00249796058, -0.0034719836, -0.003478111, + 6.03029093e-06, -0.00211180653, 0.000114592229, -0.00240143575, + -0.00592383416, -0.00984606426, 0.00129341101, 0.00100650277, + 0.000906444562, -0.00139640097, -0.000192714069, 0.00277191238, + -0.000245573436, -0.00680374401, 0.00356984767, -0.00120577728, + -0.000766036392, -0.00487764599, 0.000532136182, -0.00413817167, + -0.0302855149, -0.0406391025, 0.0006130244, 0.0183675159, + -0.00247384049, -0.00609699031, 0.00127684267, -0.00235637, + 0.00715987338, 0.00783564895, -0.00139878597, -0.0048744888, + 0.00356917572, -0.0164020304, 0.0179400034, 0.000975746894, + -0.00529623777, -0.00490315, 0.00691250199, 0.00286021968, + -0.00426661829, -0.00417789398, -0.00597105641, -0.00605484238, + 0.00197085389, -0.00757023226, 0.00458694575, 0.00153650146, + -0.00345475, -0.00823391136, 0.000807857723, 0.0121598523, + -0.00745406374, -0.0135948248, 0.004774753, -0.00390140619, + -0.00208005216, -0.00362896058, 0.00558064319, -0.000532045437, + -0.00854093302, 0.00566324079, -0.00435794424, 0.00403016619, + 0.000468764076, 0.000297251798, -0.00617758604, -0.00338481856, + 0.00280403625, -0.00649327, -0.000154057736, -0.000479023496}); + test::FillValues(&sp_values, + {9, 2, 9, 2, 2, 9, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, + 9, 2, 2, 2, 2, 9, 2, 9, 2, 2, 2, 9, 2, 9, 2, 2}); + test::FillValues( + &sp_indices, {0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + 
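+  // With combiner "sum" the grad op first de-duplicates the input ids (only 9 and
+  // 2 appear above) and then element-wise sums every incoming gradient row that
+  // used a given id into that id's single output row.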
AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {2, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {2}); + + test::FillValues( + &output1_tensor_expected, + {-0.0247110315, -0.00123064546, -0.0152365314, -0.0140080471, + 0.0247110203, 0.00123063289, 0.0152365509, 0.0140080536}); + + test::FillValues(&output2_tensor_expected, {9, 2}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + printf("out = %.11f , expect = %.11f\n", output1[5], output1_ex[5]); + printf("out = %.11f , expect = %.11f\n", output1[7], output1_ex[7]); + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalGradFloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocalGrad", + "FusedSafeEmbeddingLookupSparseLocalGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // input hash value + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "mean") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, {0.0103125420, 0.018807490, -0.0106398590, -0.029409127, + 0.0054132286, 0.013920069, -0.0190976150, -0.023196392, + 0.0100601720, 0.015330995, -0.0055795530, -0.024889620, + 0.0108455080, 0.018832123, -0.0095151365, -0.029357582, + 0.0100478110, 0.018798435, -0.0112019650, -0.029439624}); + test::FillValues(&sp_values, {1, 1, 0, 4, 1, 1, 1, 0, 1}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {3, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {3}); + test::FillValues( + &output1_tensor_expected, + {0.0254510570, 0.0477297000, -0.0317581670, -0.075281680, 0.0084614195, + 0.0156683810, -0.0091476020, -0.024522856, 0.0027066143, 0.0069600344, + -0.0095488075, -0.011598196}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 
*)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + // printf("out = %f , expect = %f\n", output1[0], output1_ex[0]); + // printf("out = %f , expect = %f\n", output1[1], output1_ex[1]); + // printf("out = %f , expect = %f\n", output1[2], output1_ex[2]); + // printf("out = %f , expect = %f\n", output1[3], output1_ex[3]); + + // printf("out = %d , expect = %d\n", output2[0], output2_ex[0]); + // printf("out = %d , expect = %d\n", output2[1], output2_ex[1]); + // printf("out = %d , expect = %d\n", output2[2], output2_ex[2]); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, FloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparse", + "FusedSafeEmbeddingLookupSparse") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int gathered_weight_size = 3; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {gathered_weight_size, emb_vector_dim}); + + // [1 1 0 4 1 1 1 0 1] -> [1 0 4], [0 0 1 2 0 0 0 1 0] + test::FillValues(&sp_values, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues(&sp_weight, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + // [0 0 0 1 1 3 3 4 4] + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, {-0.023765106, -0.248630840, 0.275294270, 0.228118000, + -0.147108670, -0.298352200, -0.067187610, 0.274558250, + 0.491792620, -0.094891705, 0.064489834, 0.058840238}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &emb_vector_expected, + {-0.19463888, -0.79561390, 0.48340094, 0.73079425, 0.46802750, + -0.34352255, 0.33978412, 0.28695825, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, -0.04753021, -0.49726167, 0.55058855, + 0.45623600, -0.17087378, -0.54698306, 0.20810667, 0.50267625}); + + const Tensor &emb_vector = *GetOutput(0); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-8); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, FloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparse", + "FusedSafeEmbeddingLookupSparse") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "mean") + 
.Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int gathered_weight_size = 3; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {gathered_weight_size, emb_vector_dim}); + + // [1 1 0 4 1 1 1 0 1] -> [1 0 4], [0 0 1 2 0 0 0 1 0] + test::FillValues(&sp_values, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues(&sp_weight, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + // [0 0 0 1 1 3 3 4 4] + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues(&emb_variable, + {-0.02299355, -0.247596220, 0.27484232, 0.226618130, + -0.14686598, -0.297978460, -0.06733219, 0.273977040, + 0.49191360, -0.094738655, 0.06426916, 0.058573183}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &emb_vector_expected, + {-0.064284360, -0.26439032, 0.160784140, 0.24240442, 0.234460030, + -0.17116743, 0.169555740, 0.14259565, 0.000000000, 0.00000000, + 0.000000000, 0.00000000, -0.022993550, -0.24759622, 0.274842320, + 0.22661813, -0.084929764, -0.27278733, 0.103755064, 0.25029758}); + + const Tensor &emb_vector = *GetOutput(0); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-7); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, GradFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseGrad", + "FusedSafeEmbeddingLookupSparseGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // unique_id + .Input(FakeInput(DT_INT64)) // unique_indices + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int unique_size = 3; + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + + Tensor unique_id(DT_INT64, {unique_size}); + Tensor unique_indices(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, + {0.0076283700764179229736328125, 0.0121669657528400421142578125, + -0.0049919090233743190765380859, -0.0190300568938255310058593750, + 0.0065145129337906837463378906, 0.0117923058569431304931640625, + -0.0164990965276956558227539062, -0.0200323350727558135986328125, + 0.0100607946515083312988281250, 0.0153625328093767166137695312, + -0.0056031607091426849365234375, -0.0249206330627202987670898438, + 0.0099571626633405685424804688, 0.0154269225895404815673828125, + -0.0055019007995724678039550781, -0.0239365808665752410888671875, + 0.0084272380918264389038085938, 0.0152924191206693649291992188, + -0.0086676068603992462158203125, 
-0.0239860229194164276123046875}); + test::FillValues(&unique_id, {1, 0, 4}); + test::FillValues(&unique_indices, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(unique_id.shape(), unique_id.flat()); + AddInputFromArray(unique_indices.shape(), + unique_indices.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {unique_size, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {unique_size}); + test::FillValues( + &output1_tensor_expected, + {0.0501128211617469787597656250, 0.0822724997997283935546875000, + -0.0461543202400207519531250000, -0.1299516409635543823242187500, + 0.0160556081682443618774414062, 0.0274593848735094070434570312, + -0.0136595163494348526000976562, -0.0430160798132419586181640625, + 0.0065145129337906837463378906, 0.0117923058569431304931640625, + -0.0164990965276956558227539062, -0.0200323369354009628295898438}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + printf("out = %.28f , expect = %.28f\n", output1[11], output1_ex[11]); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, GradFloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseGrad", + "FusedSafeEmbeddingLookupSparseGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // unique_id + .Input(FakeInput(DT_INT64)) // unique_indices + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "mean") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int unique_size = 3; + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + + Tensor unique_id(DT_INT64, {unique_size}); + Tensor unique_indices(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, {0.0103125420, 0.018807490, -0.0106398590, -0.029409127, + 0.0054132286, 0.013920069, -0.0190976150, -0.023196392, + 0.0100601720, 0.015330995, -0.0055795530, -0.024889620, + 0.0108455080, 0.018832123, -0.0095151365, -0.029357582, + 0.0100478110, 0.018798435, -0.0112019650, -0.029439624}); + test::FillValues(&unique_id, {1, 0, 4}); + test::FillValues(&unique_indices, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + 
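+  // With the "mean" combiner the expected gradient of a unique id is the sum,
+  // over its occurrences, of grad_variable[row] / feature_count[row]. Here
+  // sp_indices places the nine values in rows {0,0,0,1,1,3,3,4,4}, so the
+  // per-row feature counts are {3, 2, 0, 2, 2}; id 1 occurs in rows
+  // {0,0,1,3,3,4}, and (writing g_r for grad_variable row r) its expected
+  // gradient is 2/3*g0 + 1/2*g1 + g3 + 1/2*g4, which is the first row of
+  // output1_tensor_expected below.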
AddInputFromArray(unique_id.shape(), unique_id.flat()); + AddInputFromArray(unique_indices.shape(), + unique_indices.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {unique_size, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {unique_size}); + test::FillValues( + &output1_tensor_expected, + {0.0254510570, 0.0477297000, -0.0317581670, -0.075281680, 0.0084614195, + 0.0156683810, -0.0091476020, -0.024522856, 0.0027066143, 0.0069600344, + -0.0095488075, -0.011598196}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h new file mode 100644 index 00000000..b0b2b9b4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h @@ -0,0 +1,11 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor {} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc new file mode 100644 index 00000000..c19b02f1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc @@ -0,0 +1,394 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedSafeEmbeddingPostLookupGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_safe_embedding_post_look_up_grad", + "FusedEmbeddingSparsePostLookUpGrad") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, + Partition2_Mean_MaxNorm100_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "mean", 100.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, + 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0}); + AddInputFromArray( + TensorShape({4, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // sp_values: 3, 1, 4, 5, 7, 3, 12, 12, 15, 4 + // partitioned_values: 1, 3, 3, 4, 4, 5 and 7, 12, 12, 15 + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({4, 2}), {1, 7, 2, 4, 2, 7, 3, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 3, 3, 2}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({6, emb_vector_dim})); + test::FillValues( + &grad_shards_1, + {0.00000000, 
0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 5.33333349, 5.66666651, 6.00000000, 6.33333349, + 6.66666651, 7.00000000, 7.33333349, 7.66666651, 2.65028572, + 2.98157120, 3.31285667, 3.64414287, 3.97542834, 4.30671406, + 4.63799953, 4.96928549, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516, + 2.16437674, 2.43492365, 2.70547056, 2.97601795, 3.24656487, + 3.51711202, 3.78765893, 4.05820608}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({4, emb_vector_dim})); + test::FillValues( + &grad_shards_2, + {1.58337951, 1.78130186, 1.97922409, 2.17714667, 2.37506914, 2.57299161, + 2.77091384, 2.96883631, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, 2.72347474, + 3.43474555, 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, + Partition2_SUM_Float_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, Partition2_SUM_Float_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + 
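+  // feature_nums is the number of valid features per batch row (the divisor
+  // used by the "mean" / "sqrtn" combiners). row_empty_and_invalid_flags below
+  // appears to pack batch_size per-row "filled as empty" flags followed by nnz
+  // per-value validity flags; only the leading batch_size entries are read by
+  // this grad kernel. Because default_id is 0 (>= 0) in this case, gradients
+  // for values in a row flagged as empty are zeroed, which is why the second
+  // row of grad_shards_2 is expected to be all zeros.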
AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillValues(Tensor* tensor, int val) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = val; + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPostGradOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + const int nnz = 3; + const int batch_size = 512; + const int emb_vector_dim = 32; + const int entries = 8; + const float sparsity = 0.5; + const int total_inputs = batch_size * entries * sparsity; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + string op_name = "FusedEmbeddingSparsePostLookUpGrad"; + + // top_grad + Tensor top_grad(type, TensorShape({batch_size, emb_vector_dim})); + FillOnesValues(&top_grad); + + // emb_shards + std::vector input_emb_shards; + input_emb_shards.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor emb_shards( + type, TensorShape({total_inputs / num_partitions, emb_vector_dim})); + FillOnesValues(&emb_shards); + input_emb_shards.push_back(test::graph::Constant(g, emb_shards)); + // PrintValues(&emb_shards); + } + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice( + DT_INT64, TensorShape({total_inputs / num_partitions, 2})); + FillIndiceValues(&sub_partitioned_indice, i, + batch_size / num_partitions, entries * sparsity); + partitioned_indices.push_back( + 
test::graph::Constant(g, sub_partitioned_indice)); + // PrintValues(&sub_partitioned_indice); + } + + // sp_dense_shape + Tensor feature_nums(DT_INT32, TensorShape({batch_size})); + FillValues(&feature_nums, entries * sparsity); + + // row_empty_and_invalid_flags + Tensor row_empty_and_invalid_flags(DT_INT32, TensorShape({batch_size + nnz})); + FillZerosValues(&row_empty_and_invalid_flags); + + auto nodeBuilder = + NodeBuilder(g->NewName("n"), op_name) + .Attr("T", type) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(test::graph::Constant(g, top_grad)) + .Input(input_emb_shards) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, feature_nums)) + .Input(test::graph::Constant(g, row_empty_and_invalid_flags)); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_POST_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPostGradOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_POST_OP_kind(NP, C, NTH) \ + BM_EMB_POST_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_POST_OP_NTH(NP, C) \ + BM_EMB_POST_OP_kind(NP, C, 1); \ + BM_EMB_POST_OP_kind(NP, C, 4); \ + BM_EMB_POST_OP_kind(NP, C, 8); + +BM_EMB_POST_OP_NTH(2, sum); + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc new file mode 100644 index 00000000..5b86b0e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc @@ -0,0 +1,466 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +enum SparseSegmentReductionOperation { kSum, kMean, kSqrtN }; + +namespace { +inline int64 partitioned_indices( + std::vector>& indices, int indice_dim, + int64 id) { + int indices_num = indices.size(); + int64 rows = 0; + for (int i = 0; i < indices_num; ++i) { + size_t sub_nnz = std::get<0>(indices[i]); + rows += sub_nnz; + if (rows > id) { + int idx = id - (rows - sub_nnz); + return std::get<1>(indices[i])[idx * indice_dim]; + } + } +} + +inline const float* const partitioned_embedding_tables( + std::vector>& embedding_tables, + int embedding_size, int64 id) { + int tables_num = embedding_tables.size(); + int64 rows = 0; + for (int i = 0; i < tables_num; ++i) { + size_t sub_nnz = std::get<0>(embedding_tables[i]); + rows += sub_nnz; + if (rows > id) { + int idx = id - (rows - sub_nnz); + return &(std::get<1>(embedding_tables[i])[idx * embedding_size]); + } + } +} + +static void sparse_partitioned_gather( + int64 input_size, std::vector>& indices, + int indice_dim, int rows, + std::vector>& embedding_tables, + float* output, const int64_t embedding_size, 
+ SparseSegmentReductionOperation operation, const bool set_empty_row_zero, + const int* empty_row) { + // Record how many values in each row + uint64_t* row_values = new uint64_t[rows]; + memset(row_values, 0, rows * sizeof(uint64_t)); + // output_buffer is output buffer + float* output_buffer = new float[rows * embedding_size]; + memset(output_buffer, 0, rows * embedding_size * sizeof(float)); + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + auto avx512_add = [](const float* input, uint64_t input_idx, float* output, + uint64_t output_idx, const int64_t num) { + constexpr size_t float_displacement = 4; + constexpr size_t float_alignment = 16; + int64_t quotient = num >> float_displacement; + int64_t remainder = num & 0x000F; + + for (int64_t j = 0; j < quotient; ++j) { + int64_t offset = j << float_displacement; + __m512 a = _mm512_loadu_ps(&input[input_idx + offset]); + __m512 b = _mm512_loadu_ps(&output[output_idx + offset]); + a = _mm512_add_ps(a, b); + _mm512_storeu_ps(&output[output_idx + offset], a); + } + + if (remainder != 0) { + __mmask16 mask = 0xffff >> (float_alignment - remainder); + int64_t offset = quotient << float_displacement; + __m512 zero = _mm512_setzero_ps(); + __m512 a = _mm512_mask_loadu_ps(zero, mask, &input[input_idx + offset]); + __m512 b = _mm512_mask_loadu_ps(zero, mask, &output[output_idx + offset]); + a = _mm512_mask_add_ps(zero, mask, a, b); + _mm512_mask_storeu_ps(&output[output_idx + offset], mask, a); + } + }; + + auto avx512_mean = [](const float* input, uint64_t input_idx, + const float* sum, float* output, uint64_t output_idx, + const int64_t num) { + constexpr size_t float_displacement = 4; + constexpr size_t float_alignment = 16; + int64_t quotient = num >> float_displacement; + int64_t remainder = num & 0x000F; + __m512 sum_ = _mm512_broadcastss_ps(_mm_load_ss(sum)); + + for (int64_t j = 0; j < quotient; ++j) { + int64_t offset = j << float_displacement; + __m512 a = _mm512_loadu_ps(&input[input_idx + offset]); + __m512 b = _mm512_loadu_ps(&output[output_idx + offset]); + a = _mm512_add_ps(a, b); + a = _mm512_mul_ps(a, sum_); + _mm512_storeu_ps(&output[output_idx + offset], a); + } + + if (remainder != 0) { + __mmask16 mask = 0xffff >> (float_alignment - remainder); + int64_t offset = quotient << float_displacement; + __m512 zero = _mm512_setzero_ps(); + __m512 a = _mm512_mask_loadu_ps(zero, mask, &input[input_idx + offset]); + __m512 b = _mm512_mask_loadu_ps(zero, mask, &output[output_idx + offset]); + a = _mm512_mask_add_ps(zero, mask, a, b); + a = _mm512_mask_mul_ps(zero, mask, a, sum_); + _mm512_mask_storeu_ps(&output[output_idx + offset], mask, a); + } + }; +#endif + + for (int64_t i = input_size - 1; i >= 0; --i) { + // From sub_indices to find output row + auto row = partitioned_indices(indices, indice_dim, i); + row_values[row] += 1; + // From sub_embedding_tables to find embedding_table row ptr + auto embedding_row = + partitioned_embedding_tables(embedding_tables, embedding_size, i); + // add output_buffer to do block addition + uint64_t output_row = row * embedding_size; + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + avx512_add(embedding_row, 0, output_buffer, output_row, embedding_size); +#else + for (int64_t j = 0; j < embedding_size; ++j) { + output_buffer[output_row + j] += embedding_row[j]; + } +#endif + + if (row_values[row] == 8) { + memcpy(&output[output_row], &output_buffer[output_row], + embedding_size * sizeof(float)); + memset(&output_buffer[output_row], 0, embedding_size * sizeof(float)); + } else 
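+    // Blocked flush: output_buffer holds the running partial sum for each
+    // output row, and once 8 values have been gathered for a row the partial
+    // sum is moved into `output` (the first flush above copies, later flushes
+    // below add) and the buffer slot is cleared. Whatever is left in the
+    // buffer after this loop is folded in by the finalization pass further
+    // down, which also applies the mean / sqrtn scaling and zeroes rows
+    // flagged as empty.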
if (row_values[row] % 8 == 0) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + avx512_add(output_buffer, output_row, output, output_row, embedding_size); +#else + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + } +#endif + memset(&output_buffer[output_row], 0, embedding_size * sizeof(float)); + } + } + + for (int64_t i = 0; i < rows; ++i) { + int64_t output_row = i * embedding_size; + // zero emtpy rows + if (set_empty_row_zero && empty_row[i] == 1) { + memset(&output[output_row], 0, embedding_size * sizeof(float)); + } else { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + if (operation == SparseSegmentReductionOperation::kSum) { + if (row_values[i] < 8) { + memcpy(&output[output_row], &output_buffer[output_row], + embedding_size * sizeof(float)); + } else { + avx512_add(output_buffer, output_row, output, output_row, + embedding_size); + } + } else if (operation == SparseSegmentReductionOperation::kMean) { + float sum = 1.0 / static_cast(row_values[i]); + avx512_mean(output_buffer, output_row, &sum, output, output_row, + embedding_size); + } else if (operation == SparseSegmentReductionOperation::kSqrtN) { + float sqrt = 1.0 / std::sqrt(row_values[i]); + avx512_mean(output_buffer, output_row, &sqrt, output, output_row, + embedding_size); + } +#else + if (operation == SparseSegmentReductionOperation::kSum) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + } + } else if (operation == SparseSegmentReductionOperation::kMean) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + output[output_row + j] /= row_values[i]; + } + } else if (operation == SparseSegmentReductionOperation::kSqrtN) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + output[output_row + j] /= std::sqrt(row_values[i]); + } + } +#endif + } + } + + delete[] row_values; + delete[] output_buffer; +} + +static inline void set_feature_nums( + int32* feature_nums, int64 input_size, + std::vector> indices, int indice_dim) { + for (int64 i = 0; i < input_size; ++i) { + feature_nums[partitioned_indices(indices, indice_dim, i)]++; + } +} +} // namespace + +template +class FusedSafeEmbeddingPostLookupOp : public OpKernel { + public: + explicit FusedSafeEmbeddingPostLookupOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + if (combiner_ == "sum") { + operation_ = SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + + ~FusedSafeEmbeddingPostLookupOp() {} + + void Compute(OpKernelContext* ctx) override { + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, 
ctx->input_list("partitioned_indices", &partitioned_indices));
+
+    Tensor const* dense_shape_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor));
+
+    Tensor const* row_empty_and_invalid_flags = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags",
+                                   &row_empty_and_invalid_flags));
+
+    const int64_t embedding_size = emb_shards[0].shape().dim_size(1);
+
+    int input_dims = dense_shape_tensor->dim_size(0);
+    int batch_size = 1;
+    for (int i = 0; i < input_dims - 1; ++i) {
+      batch_size *= dense_shape_tensor->flat<int64>().data()[i];
+    }
+
+    // Check the input
+    OP_REQUIRES(
+        ctx, (dense_shape_tensor->dims() == 1),
+        errors::InvalidArgument("Shape tensor is not valid (dims != 1)"));
+    OP_REQUIRES(
+        ctx, (dense_shape_tensor->dim_size(0) >= 2),
+        errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)"));
+
+    const int* empty_row = row_empty_and_invalid_flags->flat<int32>().data();
+
+    Tensor* emb_vectors_tensor = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, TensorShape({batch_size, embedding_size}),
+                                  &emb_vectors_tensor));
+    float* output = (float*)emb_vectors_tensor->tensor_data().data();
+    memset(output, 0, batch_size * embedding_size * sizeof(float));
+
+    Tensor* feature_nums_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({batch_size}),
+                                             &feature_nums_tensor));
+    int32* feature_nums = (int32*)feature_nums_tensor->tensor_data().data();
+    memset(feature_nums, 0, batch_size * sizeof(int32));
+
+    int64 input_size = 0;
+    for (int i = 0; i < num_partitions_; ++i) {
+      input_size += partitioned_indices[i].shape().dim_size(0);
+    }
+
+    int indice_dim = partitioned_indices[0].shape().dim_size(1);
+
+    const bool set_empty_row_zero = default_id_ >= 0;
+
+    std::vector<std::tuple<size_t, const float*>> embedding_tables;
+    std::vector<std::tuple<size_t, const int64*>> indices;
+    embedding_tables.reserve(num_partitions_);
+    indices.reserve(num_partitions_);
+    for (int i = 0; i < num_partitions_; i++) {
+      const size_t sub_nnz = emb_shards[i].shape().dim_size(0);
+      OP_REQUIRES(
+          ctx, sub_nnz == partitioned_indices[i].shape().dim_size(0),
+          errors::InvalidArgument(
+              "emb_shard and partitioned_indice don't have the same length"));
+      embedding_tables.emplace_back(
+          std::make_tuple(sub_nnz, emb_shards[i].flat<float>().data()));
+      indices.emplace_back(std::make_tuple(
+          sub_nnz, partitioned_indices[i].flat<int64>().data()));
+    }
+
+    sparse_partitioned_gather(input_size, indices, indice_dim, batch_size,
+                              embedding_tables, output, embedding_size,
+                              operation_, set_empty_row_zero, empty_row);
+    set_feature_nums(feature_nums, input_size, indices, indice_dim);
+  }
+
+ private:
+  int num_partitions_;
+  int partition_axis_;
+  std::string combiner_;
+  float max_norm_;
+  int64_t default_id_;
+  SparseSegmentReductionOperation operation_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("FusedEmbeddingSparsePostLookUp").Device(DEVICE_CPU),
+    FusedSafeEmbeddingPostLookupOp<CPUDevice>);
+
+template <typename Device>
+class FusedSafeEmbeddingPostLookupGradOp : public OpKernel {
+ public:
+  explicit FusedSafeEmbeddingPostLookupGradOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_));
+    int temp_default_id;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id));
+    default_id_ = int64_t(temp_default_id);
+    if (combiner_ == "sum") {
+      operation_ =
SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + + void Compute(OpKernelContext* ctx) override { + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* feature_nums = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("feature_nums", &feature_nums)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + OpOutputList grad_shards; + OP_REQUIRES_OK(ctx, ctx->output_list("grad_shards", &grad_shards)); + + const float* top_grad = top_grad_tensor->flat().data(); + const int64_t batch_size = top_grad_tensor->shape().dim_size(0); + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + const int* f_nums = feature_nums->flat().data(); + const int* empty_row = row_empty_and_invalid_flags->flat().data(); + + const bool set_empty_row_zero = default_id_ >= 0; + + for (int i = 0; i < num_partitions_; i++) { + const int64_t sub_nnz = partitioned_indices[i].shape().dim_size(0); + const int64_t indices_col = partitioned_indices[i].shape().dim_size(1); + const int64* indices = partitioned_indices[i].flat().data(); + Tensor* grad_shard; + OP_REQUIRES_OK( + ctx, grad_shards.allocate(i, TensorShape({sub_nnz, emb_vec_size}), + &grad_shard)); + float* grad = grad_shard->flat().data(); + + std::vector l2_norm(sub_nnz, 1.0); + if (max_norm_ > 0.0) { + const float* emb = emb_shards[i].flat().data(); + for (int j = 0; j < sub_nnz; ++j) { + float sum = 0.0; + for (int k = 0; k < emb_vec_size; ++k) { + sum += emb[j * emb_vec_size + k] * emb[j * emb_vec_size + k]; + } + l2_norm[j] = std::sqrt(sum); + } + } + + if (operation_ == SparseSegmentReductionOperation::kSum) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else + memcpy(&grad[j * emb_vec_size], &top_grad[idx * emb_vec_size], + sizeof(float) * emb_vec_size); + } + } else if (operation_ == SparseSegmentReductionOperation::kMean) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else { + for (int k = 0; k < emb_vec_size; ++k) { + grad[j * emb_vec_size + k] = + top_grad[idx * emb_vec_size + k] / f_nums[idx]; + if (max_norm_ > 0.0 && l2_norm[j] > max_norm_) { + grad[j * emb_vec_size + k] *= max_norm_ / l2_norm[j]; + } + } + } + } + } else if (operation_ == SparseSegmentReductionOperation::kSqrtN) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else { + for (int k = 0; k < emb_vec_size; ++k) { + grad[j * emb_vec_size + k] = + top_grad[idx * emb_vec_size + k] / std::sqrt(f_nums[idx]); + if (max_norm_ > 
0.0 && l2_norm[j] > max_norm_) { + grad[j * emb_vec_size + k] *= max_norm_ / l2_norm[j]; + } + } + } + } + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; + SparseSegmentReductionOperation operation_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePostLookUpGrad").Device(DEVICE_CPU), + FusedSafeEmbeddingPostLookupGradOp); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc new file mode 100644 index 00000000..c9e38da0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc @@ -0,0 +1,419 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; +class FusedSafeEmbeddingPostLookupOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_safe_embedding_post_look_up", + "FusedEmbeddingSparsePostLookUp") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, dtype)) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +// TEST_F(FusedSafeEmbeddingPostLookupOpTest, +// Partition3_Sqrtn_MaxNorm200_Float) { +// const int nnz = 10; +// const int batch_size = 4; +// const int emb_vector_dim = 8; +// const int entries = 8; + +// MakeOpAndSetDevice(Device::CPU, 3, DT_FLOAT, "sqrtn", 200.0, -1); + +// // emb_shards +// 
AddInputFromArray( +// TensorShape({6, emb_vector_dim}), +// { +// 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, +// 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, +// 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, +// 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, +// 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, +// }); +// AddInputFromArray(TensorShape({1, emb_vector_dim}), +// {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); +// AddInputFromArray( +// TensorShape({3, emb_vector_dim}), +// {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, +// 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, +// 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + +// // partitioned_indices +// AddInputFromArray(TensorShape({6, 2}), +// {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); +// AddInputFromArray(TensorShape({1, 2}), {1, 7}); +// AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + +// // sp_dense_shape +// AddInputFromArray(TensorShape({2}), {batch_size, entries}); + +// // row_empty_and_invalid_flags +// AddInputFromArray(TensorShape({batch_size + nnz}), +// {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + +// TF_ASSERT_OK(RunOpKernel()); +// TF_EXPECT_OK(device_->Sync()); + +// { +// Tensor expected_emb_vectors(allocator(), DT_FLOAT, +// TensorShape({batch_size, emb_vector_dim})); +// test::FillValues( +// &expected_emb_vectors, +// {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, +// 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, +// 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, +// 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, +// 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, +// 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, +// 77.87342834, 78.98532867}); +// test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); +// } +// { +// Tensor feature_nums_expected(allocator(), DT_INT32, +// TensorShape({batch_size})); +// test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); +// test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); +// } +// } + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition3_Sqrtn_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 3, DT_FLOAT, "sqrtn", -1.0, -1); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + { + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, + 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + }); + AddInputFromArray(TensorShape({1, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + AddInputFromArray( + TensorShape({3, emb_vector_dim}), + {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({1, 2}), {1, 7}); + AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {22.62741661, 24.04162979, 25.45584297, 26.87005806, 28.28427124, + 29.69848442, 31.11269760, 32.52691269, 73.90083313, 75.63288116, + 77.36493683, 79.09698486, 80.82903290, 82.56108856, 84.29313660, + 86.02519226, 124.70765686, 126.43970490, 128.17175293, 129.90380859, + 131.63586426, 133.36790466, 135.09996033, 136.83201599, 107.48023224, + 108.89443970, 110.30865479, 111.72286987, 113.13708496, 114.55130005, + 115.96550751, 117.37972260}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition2_Sum_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition2_Sum_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor 
feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPostOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + const int nnz = 3; + const int batch_size = 512; + const int emb_vector_dim = 32; + const int entries = 8; + const float sparsity = 0.5; + const int total_inputs = batch_size * entries * sparsity; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + const bool isDefault = (kind == "Default"); + string op_name = isDefault ? 
"FusedEmbeddingSparsePostLookUpOrigin" + : "FusedEmbeddingSparsePostLookUp"; + + // emb_shards + std::vector input_emb_shards; + input_emb_shards.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor emb_shards( + type, TensorShape({total_inputs / num_partitions, emb_vector_dim})); + FillOnesValues(&emb_shards); + input_emb_shards.push_back(test::graph::Constant(g, emb_shards)); + // PrintValues(&emb_shards); + } + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice( + DT_INT64, TensorShape({total_inputs / num_partitions, 2})); + FillIndiceValues(&sub_partitioned_indice, i, + batch_size / num_partitions, entries * sparsity); + partitioned_indices.push_back( + test::graph::Constant(g, sub_partitioned_indice)); + // PrintValues(&sub_partitioned_indice); + } + + // sp_dense_shape + Tensor sp_dense_shape(DT_INT64, TensorShape({2})); + FillValues(&sp_dense_shape, {batch_size, entries}); + + // row_empty_and_invalid_flags + Tensor row_empty_and_invalid_flags(DT_INT32, TensorShape({batch_size + nnz})); + FillZerosValues(&row_empty_and_invalid_flags); + + auto nodeBuilder = + NodeBuilder(g->NewName("n"), op_name) + .Attr("T", type) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(input_emb_shards) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, sp_dense_shape)) + .Input(test::graph::Constant(g, row_empty_and_invalid_flags)) + .Input(partitioned_indices); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_POST_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPostOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_POST_OP_kind(NP, C, NTH) \ + BM_EMB_POST_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_POST_OP_NTH(NP, C) \ + BM_EMB_POST_OP_kind(NP, C, 1); \ + BM_EMB_POST_OP_kind(NP, C, 4); \ + BM_EMB_POST_OP_kind(NP, C, 8); + +BM_EMB_POST_OP_NTH(2, sum); + +} // namespace +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc new file mode 100644 index 00000000..c74e2317 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc @@ -0,0 +1,315 @@ +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +namespace { + +struct IndicePair { + int64_t row; + int64_t column; +}; + +enum Part_Strategy { MOD, DIV, DIV_EV }; + +typedef void (*PARTITIONALGO)(const int64_t* id_table, + const int64_t numPartitions, + const int64_t idsPerPartition, + const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId); + 
+template +inline void GetPartitionIndex(const int64_t* id_table, + const int64_t numPartitions, + const int64_t idsPerPartition, + const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) {} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { + *segment = originId % numPartitions; + *newId = originId / numPartitions; +} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + const int64_t* prange = id_table + numPartitions % 8; + __m512i voffset = _mm512_set1_epi64(originId); + int vectorSize = numPartitions / 8; + for (int i = vectorSize - 1; i >= 0; --i) { + __m512i vrange = _mm512_maskz_loadu_epi64(0xff, prange + i * 8); + __mmask8 mask = _mm512_cmple_epi64_mask(vrange, voffset); + if (mask != 0) { + int numGreater = __builtin_ctz(mask); + *segment = (numPartitions - 1) - 8 * (vectorSize - 1 - i) - numGreater; + *newId = originId - id_table[*segment]; + return; + } + } + + for (int j = numPartitions % 8 - 1; j > -1; --j) { + if (originId >= id_table[j]) { + *segment = j; + *newId = originId - id_table[j]; + break; + } + } +#else + *segment = originId < extras * (idsPerPartition + 1) + ? originId / (idsPerPartition + 1) + : (originId - extras) / idsPerPartition; + *newId = *segment < extras ? originId % (idsPerPartition + 1) + : (originId - extras) % idsPerPartition; +#endif +} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { + *segment = originId < 0 ? *segment : 0; + *newId = originId; +} +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; + +class FusedEmbeddingSparsePreLookUpCPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePreLookUpCPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("fill_empty_row", &fill_empty_row_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("prune_invalid_id", &prune_invalid_id_)); + + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("partition_strategy", &partition_strategy_str_)); + if (partition_strategy_str_ == "div") { + partition_strategy_ = GetPartitionIndex; + } else if (partition_strategy_str_ == "mod") { + partition_strategy_ = GetPartitionIndex; + } else if (partition_strategy_str_ == "div_ev") { + partition_strategy_ = GetPartitionIndex; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument("Not support partition_strategy type. ", + partition_strategy_)); + } + } + + void Compute(OpKernelContext* ctx) override { + const int64_t default_id = default_id_ >= 0 ? default_id_ : 0; + // 1. 
get input tensor + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + const int64_t nnz = values_tensor->shape().dim_size(0); + + const int64_t* values = + reinterpret_cast(values_tensor->flat().data()); + + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + + const int64_t* indices = + reinterpret_cast(indices_tensor->flat().data()); + + Tensor const* dense_shape = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape)); + const int64_t batch_size = dense_shape->flat().data()[0]; + + OpInputList partition_shapes; + OP_REQUIRES_OK(ctx, ctx->input_list("partition_shapes", &partition_shapes)); + + partition_total_sizes_ = 0; + for (const Tensor& shape : partition_shapes) { + OP_REQUIRES(ctx, shape.dims() <= 2, + errors::InvalidArgument( + "input partition_shapes must all less than rank 2")); + partition_total_sizes_ += shape.flat().data()[0]; + } + + if (partition_total_sizes_ == 1) { + partition_strategy_ = GetPartitionIndex; + } + + // 1.1 define output tensors + OpOutputList partitioned_values; + OP_REQUIRES_OK(ctx, + ctx->output_list("partitioned_values", &partitioned_values)); + OpOutputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->output_list("partitioned_indices", &partitioned_indices)); + + Tensor* all_flags; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(2 * num_partitions_, + TensorShape{batch_size + nnz}, &all_flags)); + int32_t* all_flags_list = all_flags->flat().data(); + + memset(all_flags_list, 0, (batch_size + nnz) * sizeof(int32_t)); + + // 2.1 get index + const int64_t idsPerPartition = partition_total_sizes_ / num_partitions_; + const int64_t extras = partition_total_sizes_ % num_partitions_; + std::vector empty_index_; + // [p_seg_nums + list(p_seg, p_id)] + int64_t* const id_index_array = new int64_t[num_partitions_ + 1 + nnz * 2]; + memset(id_index_array, 0, (num_partitions_ + 1) * sizeof(int64_t)); + + // 2.2 get the map of the mutli-table index + int64_t default_p_seg = 0; + int64_t default_p_val = 0; + int64_t p_seg = 0; + int64_t p_val = 0; + register int64_t tmp_id; + int64_t* const min_id_per_seg = new int64_t[num_partitions_]; +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int64_t* tmp_value_arr; + + // 2.1 build min_id_per_seg + memset(min_id_per_seg, 0, (num_partitions_) * sizeof(int64_t)); + for (int i = 0; i < num_partitions_; ++i) { + min_id_per_seg[i] = + i < extras ? i * (idsPerPartition + 1) : i * idsPerPartition + extras; + } + + // 2.2.1 get new seg & id in id_index_array + int64_t* new_p_seg; + int64_t* new_p_id; + int64_t* id_indices = id_index_array + num_partitions_ + 1; + + for (int64_t index = 0; index < nnz; ++index) { + new_p_seg = id_indices + index * 2; + new_p_id = id_indices + index * 2 + 1; + + // set default values; + *(new_p_seg) = prune_invalid_id_ ? num_partitions_ : 0; + *(new_p_id) = *(values + index); + + // set all_flags_list; + all_flags_list[batch_size + index] = (*new_p_id < 0) ? 0 : 1; + all_flags_list[*(indices + index * 2)] += + !prune_invalid_id_ || !(*new_p_id < 0); + + partition_strategy_(min_id_per_seg, num_partitions_, idsPerPartition, + extras, *(new_p_seg + 1), new_p_seg, new_p_id); + ++id_index_array[*new_p_seg]; + } + +#else + for (int64_t index = 0; index < nnz; ++index) { + tmp_id = values[index]; + if (tmp_id < 0) { + p_seg = prune_invalid_id_ ? 
num_partitions_ : 0; + p_val = values[index]; + all_flags_list[*(indices + 2 * index)] += !p_seg; + } else { + all_flags_list[batch_size + index] = 1; + ++all_flags_list[*(indices + 2 * index)]; + partition_strategy_(nullptr, num_partitions_, idsPerPartition, extras, + tmp_id, &p_seg, &p_val); + } + ++id_index_array[p_seg]; + *(id_index_array + 2 * index + num_partitions_ + 1) = p_seg; + *(id_index_array + 2 * index + num_partitions_ + 2) = p_val; + } +#endif + + // 2.3 fill_empty_row_index_ + if (fill_empty_row_) { + // get default id p_seg_ and p_val_ + partition_strategy_(min_id_per_seg, num_partitions_, idsPerPartition, + extras, default_id, &default_p_seg, &default_p_val); + for (int64_t origin_index = 0; origin_index < batch_size; + ++origin_index) { + if (all_flags_list[origin_index]) { + all_flags_list[origin_index] = 0; + continue; + } + all_flags_list[origin_index] = 1; + empty_index_.push_back(origin_index); + empty_index_.push_back(0); + } + } + + // 3 packaging the output tensor + for (int i = 0; i < num_partitions_; ++i) { + int64_t size = id_index_array[i]; + if (fill_empty_row_ && i == default_p_seg) { + size += empty_index_.size() >> 1; + } + + Tensor* sub_partitioned_values; + OP_REQUIRES_OK(ctx, partitioned_values.allocate( + i, TensorShape({static_cast(size)}), + &sub_partitioned_values)); + int64_t* sub_p_values = reinterpret_cast( + sub_partitioned_values->flat().data()); + + Tensor* sub_partitioned_indices; + OP_REQUIRES_OK(ctx, partitioned_indices.allocate( + i, TensorShape({static_cast(size), 2}), + &sub_partitioned_indices)); + + int64_t* sub_p_indces = reinterpret_cast( + sub_partitioned_indices->flat().data()); + if (!size) continue; + + int sub_part_index = 0; + for (int index = 0; index < nnz; ++index) { + if (id_index_array[(index) * 2 + num_partitions_ + 1] == i) { + sub_p_values[sub_part_index] = + id_index_array[(index) * 2 + num_partitions_ + 2]; + sub_p_indces[sub_part_index * 2] = *(indices + (index) * 2); + sub_p_indces[sub_part_index * 2 + 1] = *(indices + (index) * 2 + 1); + ++sub_part_index; + } + } + if (fill_empty_row_ && default_p_seg == i) { + memcpy(sub_p_indces + sub_part_index * 2, empty_index_.data(), + empty_index_.size() * sizeof(int64_t)); + + std::fill(sub_p_values + sub_part_index, sub_p_values + size, + default_p_val); + } + } + delete[] min_id_per_seg; + delete[] id_index_array; + } + + private: + int num_partitions_; + int partition_total_sizes_; + int partition_axis_; + bool fill_empty_row_; + bool prune_invalid_id_; + int64_t default_id_; + PARTITIONALGO partition_strategy_; + std::string partition_strategy_str_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePreLookUp").Device(DEVICE_CPU), + FusedEmbeddingSparsePreLookUpCPU); +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc new file mode 100644 index 00000000..ea74e624 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc @@ -0,0 +1,627 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePreLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, const int num_partitions, + const bool fill_empty_row, + const bool prune_invalid_id, const int default_id, + const string partition_strategy = "div") { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("FusedEmbeddingSparsePreLookUp", + "FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions", num_partitions) + .Attr("partition_strategy", partition_strategy) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Int64) { + MakeOpAndSetDevice(Device::CPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({12})); + test::FillValues(&expected_values, + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({12, 2})); + test::FillValues(&expected_indices, + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Fill_Empty) { + MakeOpAndSetDevice(Device::CPU, 1, true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + 
// sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({11})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({11, 2})); + test::FillValues( + &expected_indices, + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7, 2, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::CPU, 1, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 0, 0, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64) { + MakeOpAndSetDevice(Device::CPU, 3, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {6, 16}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {3, 16}); + // partition_shapes 2 + AddInputFromArray(TensorShape({2}), {7, 16}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({6})); + test::FillValues(&expected_values, {1, 5, 3, 0, 5, 5}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({6, 2})); + test::FillValues(&expected_indices, + {2, 3, 4, 6, 1, 6, 11, 6, 7, 9, 11, 8}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 1}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {12, 12, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(4)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {1, 3, 4, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(2)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {12, 12, 11, 5, 15, 0, 12, 13}); + test::ExpectTensorEqual(expected_indices, *GetOutput(5)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition2_Fill_Empty) { 
+ MakeOpAndSetDevice(Device::CPU, 2, true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, -2, -3, -4, -6, 2, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 4, 0, 5, + 2, 6, 1, 6, 7, 2, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::CPU, 2, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({7})); + test::FillValues(&expected_values, {0, 4, 3, 2, 0, 0, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({7, 2})); + test::FillValues(&expected_indices, + {0, 0, 0, 4, 1, 2, 6, 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid_Default_7) { + MakeOpAndSetDevice(Device::CPU, 2, true, true, 7); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 4, 3, 
2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({5})); + test::FillValues(&expected_values, {0, 4, 2, 2, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({5, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::CPU, 2, false, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 4, 3, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition1) { + MakeOpAndSetDevice(Device::CPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({10})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({10, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, + 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition1_Fill_Empty_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::CPU, 1, true, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + 
AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 3, 3, 3}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + ; + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64_Perfs) { + int num_partitions = 4; + int batch_size = 100000; + int num_per_part = batch_size / num_partitions; + int embed_dim = 32; + int default_id = -1; + + std::vector sp_values; + std::vector sp_indices; + + MakeOpAndSetDevice(Device::CPU, num_partitions, false, false, default_id); + + for (int i = 0; i < num_partitions; ++i) { + AddInputFromArray(TensorShape({2}), + {num_per_part * embed_dim, embed_dim}); + } + + for (int i = 0; i < batch_size * embed_dim; ++i) { + sp_values.push_back(i); + } + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < embed_dim; ++j) { + sp_indices.push_back(i); + sp_indices.push_back(j); + } + } + // sp_values + AddInputFromArray(TensorShape({sp_values.size()}), sp_values); + // sp_indices + AddInputFromArray(TensorShape({sp_values.size(), 2}), sp_indices); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, embed_dim}); + TF_ASSERT_OK(RunOpKernel()); +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPreOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + int batch_size = 100000; + int num_per_part = batch_size / num_partitions; + int embed_dim = 32; + const string partition_strategy = "div"; + const bool fill_empty_row = false; + const bool prune_invalid_id = false; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + const bool isDefault = (kind == "Default"); + string op_name = isDefault ? 
"FusedEmbeddingSparsePreLookUp" + : "FusedEmbeddingSparsePreLookUp"; + + std::vector sp_values; + std::vector sp_indices; + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice(DT_INT64, TensorShape({2})); + FillValues(&sub_partitioned_indice, + {num_per_part * embed_dim, embed_dim}); + partitioned_indices.push_back( + test::graph::Constant(g, sub_partitioned_indice)); + } + + for (int i = 0; i < batch_size * embed_dim; ++i) { + sp_values.push_back(i); + } + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < embed_dim; ++j) { + sp_indices.push_back(i); + sp_indices.push_back(j); + } + } + + // sp_values + Tensor sp_values_t(DT_INT64, TensorShape({sp_values.size()})); + FillValues(&sp_values_t, sp_values); + + // sp_indices + Tensor sp_indices_t(DT_INT64, TensorShape({sp_values.size(), 2})); + FillValues(&sp_indices_t, sp_indices); + + // sp_dense_shape + Tensor sp_dense_shape_t(DT_INT64, TensorShape({2})); + FillValues(&sp_dense_shape_t, {batch_size, embed_dim}); + + auto nodeBuilder = NodeBuilder(g->NewName("n"), op_name) + .Attr("num_partitions", num_partitions) + .Attr("partition_strategy", partition_strategy) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, sp_values_t)) + .Input(test::graph::Constant(g, sp_indices_t)) + .Input(test::graph::Constant(g, sp_dense_shape_t)); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_PRE_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_PRE_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPreOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_PRE_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_PRE_OP_kind(NP, C, NTH) \ + BM_EMB_PRE_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_PRE_OP_NTH(NP, C) \ + BM_EMB_PRE_OP_kind(NP, C, 1); \ + // BM_EMB_PRE_OP_kind(NP, C, 4); \ + // BM_EMB_PRE_OP_kind(NP, C, 8); \ + +BM_EMB_PRE_OP_NTH(4, sum); + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h new file mode 100644 index 00000000..19b839f0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h @@ -0,0 +1,98 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ + +#if GOOGLE_CUDA + +#define CK_CUDA_THROW_(x) \ + do { \ + cudaError_t retval = (x); \ + if (retval != cudaSuccess) { \ + throw std::runtime_error(std::string("Runtime error: ") + \ + (cudaGetErrorString(retval)) + " " + __FILE__ + \ + ":" + std::to_string(__LINE__) + " \n"); \ + } \ + } while (0) + +namespace tensorflow { + +namespace { + +inline int CalcBlocksLinearMapping(const int problem_size, const int threads) { + return problem_size % threads == 0 ? 
(problem_size / threads) + : (problem_size / threads + 1); +} + +struct IndicePair { + int64_t row_in_batch; + int64_t entry_in_column; +}; + +enum Combiner { Mean, Sum, Sqrtn }; + +template +__forceinline__ __device__ float Combine(const float in, const T feature_num); + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in / feature_num; +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in; +} + +template <> +__forceinline__ __device__ float Combine( + const float in, const float feature_num) { + return in / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in / feature_num; +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in; +} + +template +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num); + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad / feature_num; +} + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad; +} +} // namespace + +} // namespace tensorflow + +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc new file mode 100644 index 00000000..84581673 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc @@ -0,0 +1,315 @@ +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { + +__global__ void SetToIntMaxSTG128(int* values_offset, const int batch_size) { + const int thread_offset = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int int_max = 0x7fffffff; + if (thread_offset + 4 < batch_size) { + ::int4 four = make_int4(int_max, int_max, int_max, int_max); + *((::int4*)(values_offset + thread_offset)) = four; + } else if (thread_offset < batch_size) { + for (int i = thread_offset; i < batch_size; i++) { + values_offset[i] = int_max; + } + } +} + +__global__ void CalcPerElementRowInBatchValuesOffset(const int64_t* indices, + int* values_offset, + const int64_t nnz) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_offset < int(nnz)) { + const int64_t element_row = indices[2 * thread_offset]; + atomicMin(values_offset + int(element_row), thread_offset); + } +} + +template +__global__ void EmbeddingLookUp(const float* emb_variable, + const int64_t* values, const int* values_offset, + float* embedding_vector, const float max_norm, + const int emb_vec_size, + const int64_t batch_size, const int64_t nnz) { + __shared__ float l2_sum[1]; + + int value_offset = values_offset[blockIdx.x]; + int feature_num; + 
if (blockIdx.x == int(batch_size) - 1) { + feature_num = int(nnz) - value_offset; + } else { + feature_num = values_offset[blockIdx.x + 1] - value_offset; + } + float out = 0.0f; + for (int i = 0; i < feature_num; i++) { + float emb_element = + emb_variable[int(values[value_offset + i]) * emb_vec_size + + threadIdx.x]; + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with max_norm. + // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + + // combine + out = Combine(out, feature_num); + + // store the embedding vector + embedding_vector[blockIdx.x * emb_vec_size + threadIdx.x] = out; +} + +template +__global__ void DoEmbeddingGrad(const float* top_grad, + const float* emb_variable, + const int64_t* values, const int* values_offset, + float* grad_values, const float max_norm, + const int emb_vec_size, + const int64_t batch_size, const int64_t nnz) { + __shared__ float l2_sum[1]; + const int value_offset = values_offset[blockIdx.x]; + int feature_num; + if (blockIdx.x == int(batch_size) - 1) { + feature_num = int(nnz) - value_offset; + } else { + feature_num = values_offset[blockIdx.x + 1] - value_offset; + } + float grad = top_grad[blockIdx.x * emb_vec_size + threadIdx.x]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + float emb_element = + emb_variable[int(values[value_offset + i]) * emb_vec_size + + threadIdx.x]; + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + grad_values[(value_offset + i) * emb_vec_size + threadIdx.x] = grad_i; + } +} + +} // namespace + +class FusedEmbeddingLocalSparseLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingLocalSparseLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + Tensor const* dense_shape_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor)); + Tensor const* emb_variable_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("emb_variable", &emb_variable_tensor)); + + auto dense_shape = dense_shape_tensor->flat().data(); + const size_t batch_size = dense_shape[0]; + const int64 nnz = indices_tensor->shape().dim_size(0); + const int64 emb_vec_size = emb_variable_tensor->shape().dim_size(1); + + TensorShape emb_vectors_tensor_shape; + + emb_vectors_tensor_shape = TensorShape( + std::vector({static_cast(batch_size), emb_vec_size})); + Tensor* emb_vectors_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + 
TensorShape(std::vector({static_cast(batch_size)})); + + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, values_offset_tensor_shape, + &values_offset_tensor)); + + { + const int threads = 1024; + int blocks = batch_size / threads; + blocks = batch_size % threads == 0 ? blocks : blocks + 1; + SetToIntMaxSTG128<<>>( + values_offset_tensor->flat().data(), int(batch_size)); + } + { + const int threads = 1024; + int blocks = nnz % threads == 0 ? (nnz / threads) : (nnz / threads + 1); + + // calculate values offset + CalcPerElementRowInBatchValuesOffset<<>>( + reinterpret_cast( + indices_tensor->flat().data()), + values_offset_tensor->flat().data(), nnz); + } + { + const int blocks = int(batch_size); + const int threads = int(emb_vec_size); + if (combiner_ == "sqrtn") { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } else if (combiner_ == "mean") { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } else { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } + } + } + + private: + std::string combiner_; + float max_norm_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingLocalSparseLookUp") + .Device(DEVICE_GPU) + .HostMemory("sp_dense_shape"), + FusedEmbeddingLocalSparseLookUpGPU); + +class FusedEmbeddingLocalSparseLookUpGradGPU : public OpKernel { + public: + explicit FusedEmbeddingLocalSparseLookUpGradGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + Tensor const* emb_variable_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("emb_variable", &emb_variable_tensor)); + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + Tensor const* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values_offset", &values_offset_tensor)); + + const int64 emb_vec_size = top_grad_tensor->shape().dim_size(1); + const int64 batch_size = top_grad_tensor->shape().dim_size(0); + const int64 nnz = values_tensor->shape().dim_size(0); + + Tensor* grad_emb_weight_sp_values_tensor; + TensorShape grad_emb_weight_sp_values_tensor_shape = + TensorShape(std::vector({nnz, emb_vec_size})); + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, grad_emb_weight_sp_values_tensor_shape, + &grad_emb_weight_sp_values_tensor)); + + { + const int blocks = int(batch_size); + const int threads = int(emb_vec_size); + + if (combiner_ == "sqrtn") { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + 
values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } else if (combiner_ == "mean") { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } else { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } + } + } + + private: + float max_norm_; + std::string combiner_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingLocalSparseLookUpGrad").Device(DEVICE_GPU), + FusedEmbeddingLocalSparseLookUpGradGPU); + +} // namespace tensorflow +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc new file mode 100644 index 00000000..04e79ad7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc @@ -0,0 +1,419 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string& combiner_str, float& max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_emb_vector_expected(Tensor* expected); + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 
73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class FusedEmbeddingLocalSparseLookUpOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up", + "FusedEmbeddingLocalSparseLookUp") + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(dtype)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor& emb_vector = *GetOutput(0); + const Tensor& values_offset = *GetOutput(1); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + test::ExpectTensorEqual(sp_values_offset_expected, values_offset); + } +}; + +template +void fill_grad_expected(Tensor* expected); + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 
3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 
22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, + 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, + 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, + 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, + 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, + 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, + 7.00000000, 7.33333349, 7.66666651, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, + 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, + 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +} + +class FusedEmbeddingLocalSparseLookUpGradOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up_grad", + "FusedEmbeddingLocalSparseLookUpGrad") + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int bucket_size = 16; + + Tensor top_grad(dtype, {batch_size, emb_vector_dim}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_values_offset(DT_INT32, {batch_size}); + + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 
15, 4}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(dtype, {nnz, emb_vector_dim}); + fill_grad_expected(&grad_expected); + + const Tensor& grad = *GetOutput(0); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSqrtnGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatMeanGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSumGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatSumGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { + Run(Device::GPU); +} + +#endif + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc new file mode 100644 index 00000000..c8408134 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc @@ -0,0 +1,308 @@ +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("FusedEmbeddingLocalSparseLookUp") + .Attr("T: {float32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("sp_values: int64") + .Input("sp_indices: int64") + .Input("sp_dense_shape: int64") + .Input("emb_variable: T") + .Output("emb_vectors: T") + .Output("sp_values_offset: int32") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle temp; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 1, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 2, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(2), 1, &temp)); + ShapeHandle emb_var_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle batch_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + return OkStatus(); + }); +// .Doc(R"doc( +// FusedEmbedding ops that performs a local embedding lookup. The process will +// perform embedding vector copying from emb_variable. The input is usually a +// SparseTensor. The output sp_values_offset is reserved for gradient +// calculation. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingLocalSparseLookUpGrad") + .Attr("T: {float32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("top_grad: T") + .Input("emb_variable: T") + .Input("sp_values: int64") + .Input("sp_values_offset: int32") + .Output("grad_emb_weight_sp_values: T") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(0, ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + return OkStatus(); + }); + +// .Doc(R"doc( +// The gradient ops for FusedEmbeddingLocalSparseLookUp. sp_values_offset from +// the forward op need to be passed to this grad op as input. +// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions: int >= 1 = 1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("fill_empty_row: bool = false") + .Attr("prune_invalid_id: bool = false") + .Attr("default_id: int = -1") + .Attr("partition_strategy: {'div','mod'} = 'div'") + .Input("partition_shapes: num_partitions * int64") + .Input("sp_values: int64") + .Input("sp_indices: int64") + .Input("sp_dense_shape: int64") + .Output("partitioned_values: num_partitions * int64") + .Output("partitioned_indices: num_partitions * int64") + .Output("row_empty_and_invalid_flags: int32") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + int partition_axis; + TF_RETURN_IF_ERROR(ctx->GetAttr("partition_axis", &partition_axis)); + + ShapeHandle unused; + // sp_values + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(num_partitions), 1, &unused)); + // sp_indices + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_partitions + 1), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + // sp_dense_shape + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_partitions + 2), 1, &unused)); + + // partition_shapes + for (int i = 0; i < num_partitions; i++) { + ShapeHandle partition_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 1, &partition_shape)); + TF_RETURN_IF_ERROR( + ctx->WithValue(ctx->NumElements(partition_shape), 2, &unused_dim)); + + ShapeHandle values_result_shape, indices_result_shape; + if (int(partition_axis) == 0) { + values_result_shape = ctx->MakeShape({ctx->UnknownDim()}); + indices_result_shape = ctx->MakeShape({ctx->UnknownDim(), 2}); + } else { + return errors::InvalidArgument("partition_axis > 0 not implemented!"); + } + ctx->set_output(i, values_result_shape); + ctx->set_output(i + num_partitions, indices_result_shape); + } + ctx->set_output(2 * num_partitions, ctx->MakeShape({ctx->UnknownDim()})); + + return OkStatus(); + }); +// .Doc(R"doc( +// A fused embedding op, usually using for partitioned and distriuted embedding +// variables. FusedEmbeddingSparsePreLookUp, FusedEmbeddingSparsePostLookUp +// should be used together. This op will first read the partition pattern of +// embedding variables through partition_shapes, then sort, re-calculate and +// assign the embedding indices to the corresponding partition. Several Gather +// ops usually should be appended after this op to gather embedding shards from +// multiple partitioned embedding variables. This op has no gradient function. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePostLookUp") + .Attr("T : {float32}") + .Attr("num_partitions: int >= 1 = 1") + .Attr("default_id: int = -1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("emb_shards: num_partitions * T") + .Input("partitioned_indices: num_partitions * int64") + .Input("sp_dense_shape: int64") + .Input("row_empty_and_invalid_flags: int32") + .Input( + "partitioned_values: num_partitions * int64") // only for backward use. + // actually directly port + // to python grad op + // output + .Output("emb_vectors: T") + .Output("feature_nums: int32") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + + ShapeHandle first_emb_shard_shape; + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(0), 2, &first_emb_shard_shape)); + + ShapeHandle unused; + for (int i = 0; i < num_partitions; i++) { + // emb_shards + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + // partitioned_indices + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(i + num_partitions), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + } + // sp_dense_shape + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions), 1, &unused)); + // row_empty_and_invalid_flags + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 1), 1, &unused)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(first_emb_shard_shape, 1); + ctx->set_output(0, ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + ctx->set_output(1, ctx->MakeShape({ctx->UnknownDim()})); + return OkStatus(); + }); + +// .Doc(R"doc( +// A fused embedding op, usually using for partitioned and distriuted embedding +// variables. FusedEmbeddingSparsePreLookUp, FusedEmbeddingSparsePostLookUp +// should be used together. There should be several Gather ops before this op. +// The Gather ops gather embedding shards from embedding variable and this op +// glue them together, then apply combiner and max_morm according to embedding +// indices. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePostLookUpGrad") + .Attr("T : {float32}") + .Attr("num_partitions: int >= 1 = 1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("default_id: int = -1") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("top_grad: T") + .Input("emb_shards: num_partitions * T") + .Input("partitioned_indices: num_partitions * int64") + .Input("feature_nums: int32") + .Input("row_empty_and_invalid_flags: int32") + .Output("grad_shards: num_partitions * T") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + + ShapeHandle unused; + ShapeHandle top_grad_shape; + + // top_grad + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &top_grad_shape)); + // emb_shards + for (int i = 1; i < num_partitions + 1; i++) { + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + } + // partitioned_indices + for (int i = num_partitions + 1; i < 2 * num_partitions + 1; i++) { + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + } + // feature_nums + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 1), 1, &unused)); + // row_empty_and_invalid_flags + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 2), 1, &unused)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + + ShapeHandle output_shape = + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim}); + for (int i = 0; i < num_partitions; i++) { + ctx->set_output(i, output_shape); + } + return OkStatus(); + }); + +// .Doc(R"doc( +// Calculate gradient of FusedEmbeddingSparsePostLookUp +// )doc"); + +REGISTER_OP("FusedSafeEmbeddingLookupSparseLocal") + .Input("weight: T_weight") + .Input("id_input: T_id") + .Input("dense_shape: T_shape") + .Input("indice: T_shape") + .Input("weight_input: T_id") + .Output("embedded: T") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") + .Attr("prune: bool = true") + .Attr("max_norm: float = -1.0") + .Attr("default_id: int = -1") + .Attr("partition_strategy: {'div','mod'} = 'div'") + .Attr("T_id: {int64, int32}") + .Attr("T_shape: {int64, int32}") + .Attr("T_weight: {float, resource}") + .Attr("T: {float} = DT_FLOAT") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle temp; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 1, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3), 2, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(2), 1, &temp)); + ShapeHandle emb_var_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle batch_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + return OkStatus(); + }); + +REGISTER_OP("FusedSafeEmbeddingLookupSparseLocalGrad") + .Input("gradients: T") + .Input("input: Tinput") + .Input("indices: Tindices") + .Input("dense_shape: Tdense_shape") + .Output("output: T") + .Output("unique_value: Tinput") + .Attr("T: {float}") + .Attr("Tinput: {int64}") + .Attr("Tindices: {int64, int32}") + .Attr("Tdense_shape: {int64, int32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle emb_var_shape; + 
TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle unique_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({unique_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + ShapeHandle unique_value_shape = ctx->MakeShape({unique_dim}); + ctx->set_output(1, unique_value_shape); + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc new file mode 100644 index 00000000..acef2961 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc @@ -0,0 +1,243 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePostLookUpGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding__sparse_post_look_up_grad", + "FusedEmbeddingSparsePostLookUpGrad") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_Mean_MaxNorm100_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "mean", 100.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 
24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, + 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0}); + AddInputFromArray( + TensorShape({4, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // sp_values: 3, 1, 4, 5, 7, 3, 12, 12, 15, 4 + // partitioned_values: 1, 3, 3, 4, 4, 5 and 7, 12, 12, 15 + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({4, 2}), {1, 7, 2, 4, 2, 7, 3, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 3, 3, 2}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({6, emb_vector_dim})); + test::FillValues( + &grad_shards_1, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 5.33333349, 5.66666651, 6.00000000, 6.33333349, + 6.66666651, 7.00000000, 7.33333349, 7.66666651, 2.65028572, + 2.98157120, 3.31285667, 3.64414287, 3.97542834, 4.30671406, + 4.63799953, 4.96928549, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516, + 2.16437674, 2.43492365, 2.70547056, 2.97601795, 3.24656487, + 3.51711202, 3.78765893, 4.05820608}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({4, emb_vector_dim})); + test::FillValues( + &grad_shards_2, + {1.58337951, 1.78130186, 1.97922409, 2.17714667, 2.37506914, 2.57299161, + 2.77091384, 2.96883631, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, 2.72347474, + 3.43474555, 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_SUM_Float_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + 
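+  // Layout note (not in the original test): the first batch_size entries are
+  // per-row emptiness flags and the remaining nnz entries are per-value
+  // validity flags, matching the single row_empty_and_invalid_flags output
+  // produced by FusedEmbeddingSparsePreLookUp.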
AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_SUM_Float_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc new file mode 100644 index 00000000..1e3bacc2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc @@ -0,0 +1,328 @@ +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "cub/thread/thread_operators.cuh" +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { +__global__ void SumUpEmbeddingShard(const float* emb_shard, + const int64_t* partitioned_indice, + float* emb_vectors, int* feature_nums, + const float max_norm, + const int emb_vec_size) { + __shared__ float l2_sum[1]; + + const int64_t row_in_batch = partitioned_indice[2 * blockIdx.x]; + float emb_element = emb_shard[blockIdx.x * emb_vec_size + threadIdx.x]; + if (max_norm >= 0.0f) { + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + + 
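+  // Accumulate this (possibly norm-clipped) element into the output row for
+  // its batch entry; feature_nums counts how many shard rows contributed so
+  // that ApplyCombiner can normalize afterwards.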
atomicAdd(emb_vectors + row_in_batch * emb_vec_size + threadIdx.x, + emb_element); + + if (threadIdx.x == 0) { + atomicAdd(feature_nums + row_in_batch, 1); + } +} + +template +__global__ void ApplyCombiner(float* emb_vectors, const int* row_emptiness_flag, + const bool set_empty_row_zero, + const int* feature_nums) { + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (set_empty_row_zero) { + if (row_emptiness_flag[blockIdx.x]) { + emb_vectors[offset] = 0.0f; + return; + } + } + const int feature_num = feature_nums[blockIdx.x]; + const float emb_element = emb_vectors[offset]; + emb_vectors[offset] = Combine(emb_element, feature_num); +} + +template +__global__ void DistributeGradToShard( + const float* top_grad, const float* emb_shard, + const int64_t* partitioned_indice, const int* feature_nums, + const int* row_emptiness_flag, const bool set_empty_row_zero, + float* grad_shard, const int64_t sub_nnz, const int64_t emb_vec_size, + const float max_norm) { + __shared__ int64_t row_in_batch_shared[1]; + __shared__ int feature_num_shared[1]; + __shared__ float l2_sum[1]; + int64_t row_in_batch; + if (threadIdx.x == 0) { + row_in_batch = partitioned_indice[2 * blockIdx.x]; + row_in_batch_shared[0] = row_in_batch; + feature_num_shared[0] = feature_nums[row_in_batch]; + } + __syncthreads(); + row_in_batch = row_in_batch_shared[0]; + const int feature_num = feature_num_shared[0]; + if (set_empty_row_zero) { + if (row_emptiness_flag[row_in_batch]) { + grad_shard[blockIdx.x * emb_vec_size + threadIdx.x] = 0.0f; + return; + } + } + float grad = top_grad[row_in_batch * emb_vec_size + threadIdx.x]; + grad = CombineGrad(grad, feature_num); + if (max_norm >= 0.0f) { + const float emb_element = + emb_shard[blockIdx.x * emb_vec_size + threadIdx.x]; + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad *= max_norm / l2_norm; + } + } + grad_shard[blockIdx.x * emb_vec_size + threadIdx.x] = grad; +} +} // namespace + +class FusedEmbeddingSparsePostLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePostLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* dense_shape_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + const int64_t batch_size = dense_shape_tensor->flat().data()[0]; + + // 1. 
sum up emb values from different entries and dump into output + Tensor* emb_vectors_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({batch_size, emb_vec_size}), + &emb_vectors_tensor)); + // stream_executor::DeviceMemoryBase emb_vectors_wrapper( + // emb_vectors_tensor.flat().data(), + // emb_vectors_tensor->NumElements() * sizeof(float)); + // stream->ThenMemZero(&emb_vectors_wrapper, + // emb_vectors_tensor->NumElements() * sizeof(float)); + + cudaMemsetAsync(emb_vectors_tensor->flat().data(), 0x0, + sizeof(float) * emb_vectors_tensor->NumElements(), stream); + + Tensor* feature_nums; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({batch_size}), &feature_nums)); + // stream_executor::DeviceMemoryBase feature_nums_wrapper( + // feature_nums.flat().data(), + // feature_nums.NumElements() * sizeof(int)); + // stream->ThenMemZero(&feature_nums_wrapper, + // feature_nums.NumElements() * sizeof(int)); + cudaMemsetAsync(feature_nums->flat().data(), 0x0, + sizeof(int) * feature_nums->NumElements(), stream); + + for (int i = 0; i < num_partitions_; i++) { + const size_t sub_nnz = emb_shards[i].shape().dim_size(0); + OP_REQUIRES( + ctx, sub_nnz == partitioned_indices[i].shape().dim_size(0), + errors::InvalidArgument( + "emb_shard and partitioned_indice dosn't have the same length")); + + { + const int blocks = sub_nnz; + const int threads = emb_vec_size; + SumUpEmbeddingShard<<>>( + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + emb_vectors_tensor->flat().data(), + feature_nums->flat().data(), max_norm_, emb_vec_size); + CK_CUDA_THROW_(cudaGetLastError()); + } + } + + const bool set_empty_row_zero = default_id_ >= 0; + // 2. combiner + { + const int blocks = batch_size; + const int threads = emb_vec_size; + if (combiner_ == "sqrtn") { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } else if (combiner_ == "mean") { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } else { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } + CK_CUDA_THROW_(cudaGetLastError()); + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingSparsePostLookUp") + .Device(DEVICE_GPU) + .HostMemory("sp_dense_shape"), + FusedEmbeddingSparsePostLookUpGPU); + +class FusedEmbeddingSparsePostLookUpGradGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePostLookUpGradGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, 
ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* feature_nums = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("feature_nums", &feature_nums)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + OpOutputList grad_shards; + OP_REQUIRES_OK(ctx, ctx->output_list("grad_shards", &grad_shards)); + + const int64_t batch_size = top_grad_tensor->shape().dim_size(0); + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + + const bool set_empty_row_zero = default_id_ >= 0; + + for (int i = 0; i < num_partitions_; i++) { + const int64_t sub_nnz = partitioned_indices[i].shape().dim_size(0); + + Tensor* grad_shard; + OP_REQUIRES_OK( + ctx, grad_shards.allocate(i, TensorShape({sub_nnz, emb_vec_size}), + &grad_shard)); + + { + const int blocks = sub_nnz; + const int threads = emb_vec_size; + if (combiner_ == "sqrtn") { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } else if (combiner_ == "mean") { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } else { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } + CK_CUDA_THROW_(cudaGetLastError()); + } + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePostLookUpGrad").Device(DEVICE_GPU), + FusedEmbeddingSparsePostLookUpGradGPU); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc new file mode 100644 index 00000000..3321f3ff --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc @@ -0,0 +1,213 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; +class FusedEmbeddingSparsePostLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_sparse_post_look_up", + "FusedEmbeddingSparsePostLookUp") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, dtype)) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, + Partition3_Sqrtn_MaxNorm200_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 3, DT_FLOAT, "sqrtn", 200.0, -1); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + { + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, + 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + }); + AddInputFromArray(TensorShape({1, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + AddInputFromArray( + TensorShape({3, emb_vector_dim}), + {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({1, 2}), {1, 7}); + AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 
99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, Partition2_Sum_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, Partition2_Sum_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc new file mode 100644 index 00000000..9e2f2378 --- /dev/null +++ 
b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc @@ -0,0 +1,521 @@ +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "cub/device/device_radix_sort.cuh" +#include "cub/device/device_select.cuh" +#include "cub/iterator/constant_input_iterator.cuh" +#include "cub/thread/thread_operators.cuh" +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { + +__global__ void InitFlagsToOneInt4(int length, int* flags) { + int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (4 * offset + 3 < length) { + *((::int4*)(flags + 4 * offset)) = make_int4(1, 1, 1, 1); + } else if (4 * offset < length) { + for (int i = 0; i < length - 4 * offset; i++) { + flags[4 * offset + i] = 1; + } + } +} + +__global__ void FusedMultiFunctionalKernel( + const IndicePair* indices, const int64_t* values, const int64_t nnz, + const int64_t batch_size, const bool prune_invalid_id, + const int64_t default_id, int* row_emptiness_flag, int* invalid_id_flag, + IndicePair* tmp_indices_buffer, int64_t* values_extended) { + // This kernel will do many things together + // 1. The first part of threads will do job 1(DetectRowEmptiness), others will + // do job2(InitBatchRowsBuffer) + // 2. Do job3 (set values extended to default id) + + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset < nnz) { + // do DetectRowEmptiness + if (prune_invalid_id) { + const int64_t value = values[offset]; + if (value < 0) { + // invalid, set invalid_id_flag + atomicAnd(invalid_id_flag + offset, 0); + } else { + // valid, set row_emptiness_flag + const int64_t row_in_batch = indices[offset].row_in_batch; + atomicAnd(row_emptiness_flag + row_in_batch, 0); + } + } else { + // set row_emptiness_flag + const int64_t row_in_batch = indices[offset].row_in_batch; + atomicAnd(row_emptiness_flag + row_in_batch, 0); + } + } else { + // do InitBatchRowsBuffer + const int other_offset = offset - nnz; + if (other_offset < batch_size) { + tmp_indices_buffer[other_offset].row_in_batch = other_offset; + // always set entry id to 0; + tmp_indices_buffer[other_offset].entry_in_column = 0; + } + } + + // set values extended to default id + if (2 * offset + 1 < nnz + batch_size) { + longlong2 l2 = make_longlong2(default_id, default_id); + *((longlong2*)(values_extended + 2 * offset)) = l2; + } else if (2 * offset < nnz + batch_size) { + values_extended[2 * offset] = default_id; + } +} + +__global__ void DetectInvalid(const int64_t* values, const int64_t nnz, + int* invalid_id_flag) { + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset < nnz) { + const int64_t value = values[offset]; + if (value < 0) { + atomicAnd(invalid_id_flag + offset, 0); + } + } +} + +__global__ void CalcElementsOffsetPerPartition( + const int64_t* values_sorted, int64_t* partition_sizes_accumulate, + int64_t* elements_offset_per_partition, int nnz) { + // dichotomy + const int64_t target = partition_sizes_accumulate[blockIdx.x]; + int roof = nnz; + int floor = 0; + + int pos = (roof + floor) / 2; + while (1) { + if (pos == 0) { + pos = -1; + break; + } else if (pos == nnz - 1) { + break; + } + int64_t value = values_sorted[pos]; + int64_t value_plus_1 = values_sorted[pos + 1]; + if (value < target && value_plus_1 >= target) { + break; + } + if (value < target) { + floor = pos; + } else { + roof = pos; + } + pos = 
(roof + floor) / 2; + } + elements_offset_per_partition[blockIdx.x] = int64_t(pos + 1); +} + +__global__ void GatherAndConvertToSubPartition( + const int64_t* sub_values_sorted, int64_t* sub_partitioned_values, + const int64_t partition_start_base, const int64_t partition_size) { + const int t_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (t_offset < partition_size) { + int64_t value = sub_values_sorted[t_offset]; + // rebase value to it's corresponding sub partition + value = value - partition_start_base; + sub_partitioned_values[t_offset] = value; + } +} + +} // namespace + +class FusedEmbeddingSparsePreLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePreLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("fill_empty_row", &fill_empty_row_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("prune_invalid_id", &prune_invalid_id_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + const int64_t default_id = default_id_ >= 0 ? default_id_ : 0; + const int linear_mapping_threads = 128; + + // 1. bind inputs + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + const int64_t nnz = values_tensor->shape().dim_size(0); + + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + + Tensor const* dense_shape = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape)); + const int64_t batch_size = dense_shape->flat().data()[0]; + + OpInputList partition_shapes; + OP_REQUIRES_OK(ctx, ctx->input_list("partition_shapes", &partition_shapes)); + + partition_sizes_accumulate_.clear(); + for (const Tensor& shape : partition_shapes) { + OP_REQUIRES(ctx, shape.dims() <= 2, + errors::InvalidArgument( + "input partition_shapes must all less than rank 2")); + const int64_t accu = partition_sizes_accumulate_.empty() + ? shape.flat().data()[0] + : shape.flat().data()[0] + + partition_sizes_accumulate_.back(); + partition_sizes_accumulate_.push_back(accu); + } + + // 2. allocate cub tmp storage + Tensor cub_temp_storage; + size_t max_cub_bytes = 0; + size_t temp_storage_bytes = 0; + + if (num_partitions_ > 1) { + cub::DeviceRadixSort::SortPairs( + (void*)nullptr, temp_storage_bytes, (int64_t*)nullptr, + (int64_t*)nullptr, (IndicePair*)nullptr, (IndicePair*)nullptr, + int(nnz + batch_size), 0, sizeof(int64_t) * 8, stream); + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + } + + if (fill_empty_row_ || prune_invalid_id_) { + cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (int64_t*)nullptr, + (int*)nullptr, (int64_t*)nullptr, + (int*)nullptr, nnz, stream); + + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + + cub::DeviceSelect::Flagged( + (void*)nullptr, temp_storage_bytes, (IndicePair*)nullptr, + (int*)nullptr, (IndicePair*)nullptr, (int*)nullptr, nnz, stream); + + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? 
temp_storage_bytes + : max_cub_bytes; + + if (fill_empty_row_) { + cub::DeviceSelect::Flagged((void*)nullptr, temp_storage_bytes, + (IndicePair*)nullptr, (int*)nullptr, + (IndicePair*)nullptr, (int*)nullptr, + batch_size, stream); + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + } + } + + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(max_cub_bytes)}), + &cub_temp_storage)); + + // 3. fill_empty_row, prune, if avaliable. + Tensor values_extended; + Tensor indices_extended; + Tensor tmp_indices_buffer; + Tensor* all_flags; + Tensor selected_num_d; + int new_nnz = nnz; + + OP_REQUIRES_OK( + ctx, ctx->allocate_output(2 * num_partitions_, + TensorShape{batch_size + nnz}, &all_flags)); + + if (fill_empty_row_ || prune_invalid_id_) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT64, TensorShape{nnz + batch_size}, + &values_extended)); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT64, TensorShape{2 * (nnz + batch_size)}, + &indices_extended)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT64, TensorShape{2 * batch_size}, + &tmp_indices_buffer)); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT32, TensorShape{1}, &selected_num_d)); + + { + const int threads = linear_mapping_threads; + const int blocks = + CalcBlocksLinearMapping(batch_size + nnz, threads * 4); + InitFlagsToOneInt4<<>>( + batch_size + nnz, all_flags->flat().data()); + CK_CUDA_THROW_(cudaGetLastError()); + } + + // 3.1 set flags, init tmp_indices_buffer etc. + if (fill_empty_row_) { + { + const int threads = linear_mapping_threads; + const int blocks = CalcBlocksLinearMapping(nnz + batch_size, threads); + FusedMultiFunctionalKernel<<>>( + reinterpret_cast( + indices_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + nnz, batch_size, prune_invalid_id_, default_id, + all_flags->flat().data(), + all_flags->flat().data() + batch_size, + reinterpret_cast( + tmp_indices_buffer.flat().data()), + reinterpret_cast(values_extended.flat().data())); + CK_CUDA_THROW_(cudaGetLastError()); + } + } else if (prune_invalid_id_) { + { + const int threads = linear_mapping_threads; + const int blocks = CalcBlocksLinearMapping(nnz, threads); + DetectInvalid<<>>( + reinterpret_cast( + values_tensor->flat().data()), + nnz, all_flags->flat().data() + batch_size); + CK_CUDA_THROW_(cudaGetLastError()); + } + } + // 3.2 select copy valid id, select copy empty row indices + + cudaError_t cuda_ret = cudaSuccess; + cuda_ret = cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast(values_tensor->flat().data()), + (const int*)(all_flags->flat().data() + batch_size), + reinterpret_cast(values_extended.flat().data()), + selected_num_d.flat().data(), int(nnz), stream); + CK_CUDA_THROW_(cudaGetLastError()); + + cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast( + indices_tensor->flat().data()), + all_flags->flat().data() + batch_size, + reinterpret_cast(indices_extended.flat().data()), + selected_num_d.flat().data(), nnz, stream); + + if (prune_invalid_id_) { + int selected_num; + cudaMemcpyAsync(&selected_num, selected_num_d.flat().data(), + sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + new_nnz = selected_num; + } + + if (fill_empty_row_) { + cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast( + tmp_indices_buffer.flat().data()), + all_flags->flat().data(), + reinterpret_cast( + 
indices_extended.flat().data()) + + new_nnz, + selected_num_d.flat().data(), batch_size, stream); + CK_CUDA_THROW_(cudaGetLastError()); + int selected_num; + cudaMemcpyAsync(&selected_num, selected_num_d.flat().data(), + sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + new_nnz += selected_num; + } + } + + // 3.5 set the correct pointer + const int64_t* values_in = (fill_empty_row_ || prune_invalid_id_) + ? reinterpret_cast( + values_extended.flat().data()) + : reinterpret_cast( + values_tensor->flat().data()); + const IndicePair* indices_in = + (fill_empty_row_ || prune_invalid_id_) + ? reinterpret_cast( + indices_extended.flat().data()) + : reinterpret_cast( + indices_tensor->flat().data()); + + OpOutputList partitioned_values; + OP_REQUIRES_OK(ctx, + ctx->output_list("partitioned_values", &partitioned_values)); + OpOutputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->output_list("partitioned_indices", &partitioned_indices)); + + // 4. set output + if (num_partitions_ == 1) { + // single partition case, just directly copy + Tensor* pv_out; + OP_REQUIRES_OK( + ctx, partitioned_values.allocate( + 0, TensorShape({static_cast(new_nnz)}), &pv_out)); + Tensor* pi_out; + OP_REQUIRES_OK( + ctx, + partitioned_indices.allocate( + 0, TensorShape({static_cast(new_nnz), 2}), &pi_out)); + + cudaMemcpyAsync(pv_out->flat().data(), values_in, + sizeof(int64_t) * new_nnz, cudaMemcpyDeviceToDevice, + stream); + cudaMemcpyAsync(pi_out->flat().data(), indices_in, + sizeof(IndicePair) * new_nnz, cudaMemcpyDeviceToDevice, + stream); + + } else { + // multi-partitions case, calcaulate indices and split them. + Tensor values_sorted; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT64, TensorShape{new_nnz}, + &values_sorted)); + Tensor indices_sorted; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT64, TensorShape{new_nnz, 2}, + &indices_sorted)); + + cub::DeviceRadixSort::SortPairs( + cub_temp_storage.flat().data(), max_cub_bytes, values_in, + reinterpret_cast(values_sorted.flat().data()), + indices_in, + reinterpret_cast(indices_sorted.flat().data()), + int(new_nnz), 0, sizeof(int64_t) * 8, stream); + CK_CUDA_THROW_(cudaGetLastError()); + + // 4.1 calculate how many elements for each + // partition + Tensor partition_sizes_accumulate; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT64, TensorShape({static_cast(num_partitions_)}), + &partition_sizes_accumulate)); + cudaMemcpyAsync(partition_sizes_accumulate.flat().data(), + partition_sizes_accumulate_.data(), + num_partitions_ * sizeof(int64_t), cudaMemcpyHostToDevice, + stream); + + Tensor elements_offset_per_partition; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT64, TensorShape({static_cast(num_partitions_)}), + &elements_offset_per_partition)); + + { + const int blocks = num_partitions_; + const int threads = 1; + CalcElementsOffsetPerPartition<<>>( + reinterpret_cast( + values_sorted.flat().data()), + reinterpret_cast( + partition_sizes_accumulate.flat().data()), + reinterpret_cast( + elements_offset_per_partition.flat().data()), + int(new_nnz)); + CK_CUDA_THROW_(cudaGetLastError()); + } + + elements_offset_per_partition_.clear(); + elements_offset_per_partition_.resize(num_partitions_); + // stream_executor::DeviceMemoryBase + // elements_offset_per_partition_wrapped( + // elements_offset_per_partition.flat().data(), + // num_partitions_); + // stream->ThenMemcpy(elements_offset_per_partition_.data(), + // elements_offset_per_partition_wrapped, + // num_partitions_ * + // sizeof(int64_t)); + // 
stream->BlockHostUntilDone(); + + cudaMemcpyAsync(elements_offset_per_partition_.data(), + elements_offset_per_partition.flat().data(), + num_partitions_ * sizeof(int64_t), cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + + // 4.2 set output + int64_t sub_start_offset = 0; + for (int i = 0; i < num_partitions_; i++) { + int64_t size = elements_offset_per_partition_[i] - sub_start_offset; + + Tensor* sub_partitioned_values; + OP_REQUIRES_OK(ctx, partitioned_values.allocate( + i, TensorShape({static_cast(size)}), + &sub_partitioned_values)); + + Tensor* sub_partitioned_indices; + OP_REQUIRES_OK(ctx, partitioned_indices.allocate( + i, TensorShape({static_cast(size), 2}), + &sub_partitioned_indices)); + + if (size > 0) { + // some partition does not have any + // element that falls in it + const int threads = linear_mapping_threads; + int blocks = CalcBlocksLinearMapping(size, threads); + + const int partition_start_base = + i == 0 ? 0 : partition_sizes_accumulate_[i - 1]; + GatherAndConvertToSubPartition<<>>( + reinterpret_cast( + values_sorted.flat().data()) + + sub_start_offset, + reinterpret_cast( + sub_partitioned_values->flat().data()), + partition_start_base, size); + + CK_CUDA_THROW_(cudaGetLastError()); + + // stream_executor::DeviceMemoryBase + // sub_indices_sorted_wrapped( + // reinterpret_cast(indices_sorted.flat().data()) + // + + // partition_start_base, + // size * sizeof(IndicePair)); + // stream_executor::DeviceMemoryBase + // sub_indices_out_wrapped( + // reinterpret_cast( + // sub_partitioned_indices.flat().data()), + // size * sizeof(IndicePair)); + // stream->ThenMemcpy(&sub_indices_out_wrapped, + // sub_indices_sorted_wrapped, + // size * 2 * + // sizeof(int64_t)); + cudaMemcpyAsync( + sub_partitioned_indices->flat().data(), + indices_sorted.flat().data() + 2 * sub_start_offset, + size * 2 * sizeof(int64_t), cudaMemcpyDeviceToDevice, stream); + } + sub_start_offset = elements_offset_per_partition_[i]; + } + } + // Op kernel execution done + } + + private: + int num_partitions_; + int partition_axis_; + bool fill_empty_row_; + bool prune_invalid_id_; + int64_t default_id_; + std::vector partition_sizes_accumulate_; + std::vector elements_offset_per_partition_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingSparsePreLookUp") + .Device(DEVICE_GPU) + .HostMemory("partition_shapes") + .HostMemory("sp_dense_shape"), + FusedEmbeddingSparsePreLookUpGPU); +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc new file mode 100644 index 00000000..e9603304 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc @@ -0,0 +1,352 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePreLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, const int num_partitions, + const bool fill_empty_row, + const bool prune_invalid_id, const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_sparse_pre_look_up", + "FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64) { + MakeOpAndSetDevice(Device::GPU, 3, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {6, 16}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {3, 16}); + // partition_shapes 2 + AddInputFromArray(TensorShape({2}), {7, 16}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({6})); + test::FillValues(&expected_values, {0, 1, 3, 5, 5, 5}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({6, 2})); + test::FillValues(&expected_indices, + {11, 6, 2, 3, 1, 6, 4, 6, 7, 9, 11, 8}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 1}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {12, 12, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(4)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {2, 3, 5, 6}); + test::ExpectTensorEqual(expected_values, *GetOutput(2)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {12, 13, 12, 12, 11, 5, 15, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(5)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition2_Fill_Empty) { + MakeOpAndSetDevice(Device::GPU, 2, 
true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {-6, -4, -3, -2, 0, 0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {6, 1, 5, 2, 4, 0, 3, 0, 0, 0, 2, + 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::GPU, 2, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({7})); + test::FillValues(&expected_values, {0, 0, 0, 0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({7, 2})); + test::FillValues(&expected_indices, + {0, 0, 2, 0, 4, 0, 5, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid_Default_7) { + MakeOpAndSetDevice(Device::GPU, 2, true, true, 7); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 2, 3, 4}); + 
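+  // Explanatory note on the expected values (derived from the other tests in
+  // this file): with two partitions of shape {5, 8}, ids in [0, 5) stay in
+  // partition 0 and ids in [5, 10) are re-based to partition 1 as (id - 5).
+  // prune_invalid_id drops the negative ids, and fill_empty_row inserts
+  // default_id = 7 (local id 2 of partition 1) at column 0 of every row left
+  // empty, which is why partition 0 keeps only {0, 2, 3, 4} here and the
+  // partition-1 block below expects {0, 2, 2, 2, 4}.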
test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({5})); + test::FillValues(&expected_values, {0, 2, 2, 2, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({5, 2})); + test::FillValues(&expected_indices, {3, 4, 2, 0, 4, 0, 5, 0, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::GPU, 2, false, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition1) { + MakeOpAndSetDevice(Device::GPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({10})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({10, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, + 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition1_Fill_Empty_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::GPU, 1, true, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + 
AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 3, 3, 3}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD new file mode 100644 index 00000000..379f2da7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD @@ -0,0 +1,22 @@ +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") + +tf_custom_op_library( + name = "fused_embedding_ops", + srcs = [ + "compile_util.h", + "fused_layer_normalize_ops.cc", + ], + gpu_deps = [ + ], + gpu_srcs = [ + ], + deps = [ + "@com_github_google_leveldb//:leveldb", + "@sparsehash_c11//:dense_hash_map", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/common_runtime:core_cpu", + "@org_tensorflow//tensorflow/core/kernels:training_op_helpers", + "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@org_tensorflow//tensorflow/core/kernels:gather_functor", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h new file mode 100644 index 00000000..cfeffbd8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h @@ -0,0 +1,78 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ + +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/core/threadpool.h" + +using namespace tensorflow; +// A class for forced loop unrolling at compile time +template +struct compile_time_for { + template + inline static void op(const Lambda& function, Args... args) { + compile_time_for::op(function, args...); + function(std::integral_constant{}, args...); + } +}; +template <> +struct compile_time_for<1> { + template + inline static void op(const Lambda& function, Args... args) { + function(std::integral_constant{}, args...); + } +}; +template <> +struct compile_time_for<0> { + // 0 loops, do nothing + template + inline static void op(const Lambda& function, Args... 
args) {} +}; +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + +template +inline __m512 reduce_sum_block(const __m512* v) { + __m512 block_sum = _mm512_setzero_ps(); + auto reduce_sum = [&](auto idx) { + block_sum = _mm512_add_ps(block_sum, v[idx]); + }; + compile_time_for::op(reduce_sum); + return block_sum; +} + +inline __m512 reduce_sum_block_ps(const __m512* v, int64 BLOCK_NUM) { + switch (BLOCK_NUM) { + case 1: + return v[0]; + case 2: + return reduce_sum_block<2>(v); + case 3: + return reduce_sum_block<3>(v); + case 4: + return reduce_sum_block<4>(v); + case 5: + return reduce_sum_block<5>(v); + case 6: + return reduce_sum_block<6>(v); + case 7: + return reduce_sum_block<7>(v); + case 8: + return reduce_sum_block<8>(v); + } +} + +static inline float horizontal_add(__m512 src) { + __m512 tmp = _mm512_add_ps( + src, _mm512_shuffle_f32x4(src, src, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 r = _mm512_castps512_ps128(_mm512_add_ps( + tmp, _mm512_shuffle_f32x4(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1)))); + r = _mm_hadd_ps(r, r); + return _mm_cvtss_f32(_mm_hadd_ps(r, r)); +} + +#endif // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) +#endif // TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc new file mode 100644 index 00000000..d91dfe6c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc @@ -0,0 +1,678 @@ +#include "compile_util.h" + +using namespace tensorflow; + +template +class FusedLayerNormOp : public OpKernel { + private: + float epsilon; + + public: + explicit FusedLayerNormOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon)); + } + + ~FusedLayerNormOp() {} + + void Compute(OpKernelContext* context) override { + // Grab the input + const Tensor* input_tensor = &context->input(0); + const Tensor* gamma_tensor = &context->input(1); + const Tensor* beta_tensor = &context->input(2); + + const T* input = input_tensor->flat().data(); + const float* gamma = gamma_tensor->flat().data(); + const float* beta = beta_tensor->flat().data(); + + // To check the input + OP_REQUIRES(context, (input_tensor->dims() >= 2), + errors::InvalidArgument("Input dimension should be >= 2")); + OP_REQUIRES(context, (gamma_tensor->dims() == 1), + errors::InvalidArgument("dims(gamma) != 1")); + OP_REQUIRES(context, (beta_tensor->dims() == 1), + errors::InvalidArgument("dims(beta) != 1")); + + int64 cols = input_tensor->dim_size(input_tensor->dims() - 1); + OP_REQUIRES( + context, (gamma_tensor->dim_size(0) == cols), + errors::InvalidArgument("size(gamma) != last_dim_size_of_input")); + OP_REQUIRES( + context, (beta_tensor->dim_size(0) == cols), + errors::InvalidArgument("size(beta) != last_dim_size_of_input")); + + int64 rows = 1; + TensorShape mean_var_shape; + for (int i = 0; i < input_tensor->dims() - 1; ++i) { + auto dim_size = input_tensor->dim_size(i); + rows *= dim_size; + mean_var_shape.AddDim(dim_size); + } + + // Create output tensors + Tensor* output_tensor = NULL; + Tensor* mean_tensor = NULL; + Tensor* rvariance_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), + &output_tensor)); + OP_REQUIRES_OK(context, + context->allocate_output(1, mean_var_shape, &mean_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(2, 
mean_var_shape, + &rvariance_tensor)); + T* output = output_tensor->flat().data(); + float* mean = mean_tensor->flat().data(); + float* rvariance = rvariance_tensor->flat().data(); + + // Init + memset(mean, 0, sizeof(float) * rows); + memset(rvariance, 0, sizeof(float) * rows); + + // Do it + // Let every thread compute 16 rows to avoid false sharing + const int64 total_unit = (rows + 15) / 16; + const int64 unit_cost = + 16 * cols * 50; // assume every element consumes 50 cycles + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int64 block_num = cols >> 7; + int64 remainder_128 = cols & 0x7F; + int64 remainder_16 = remainder_128 & 0x0F; + int64 remainder_block_num = remainder_128 >> 4; + int64 remainder_block_num_total = remainder_block_num + !!remainder_16; +#endif // AVX512F + const float one_over_cols = 1.0f / cols; + + auto& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + thread::ThreadPool* thread_pool = worker_threads.workers; + + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * 16; + auto end_row = end_unit * 16; + if (end_row > rows) { + end_row = rows; + } +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + forward_avx512(input, gamma, beta, output, mean, rvariance, cols, + begin_row, end_row, block_num, remainder_block_num, + remainder_block_num_total, remainder_128, remainder_16, + one_over_cols); +#else + forward(input, gamma, beta, output, mean, rvariance, cols, begin_row, end_row, one_over_cols); +#endif // AVX512F + }); + } + + private: + // Compute the rows locate in the range of [begin_row, begin_row + ROWS) + void forward(const float* input, const float* gamma, const float* beta, + float* output, float* mean, float* rvariance, int64 cols, + int64 begin_row, int64 end_row, const float one_over_cols) { + for (int64 i = begin_row; i < end_row; i++) { + // Sum + int64 j = 0; + for (; j + 7 < cols; j += 8) { + T data_0 = input[i * cols + j]; + T data_1 = input[i * cols + j + 1]; + T data_2 = input[i * cols + j + 2]; + T data_3 = input[i * cols + j + 3]; + T data_4 = input[i * cols + j + 4]; + T data_5 = input[i * cols + j + 5]; + T data_6 = input[i * cols + j + 6]; + T data_7 = input[i * cols + j + 7]; + mean[i] += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + + data_6 + data_7; + } + for (; j < cols; j++) { + mean[i] += input[i * cols + j]; + } + // Mean + mean[i] *= one_over_cols; + + // variance + for (j = 0; j + 7 < cols; j += 8) { + T data_0 = input[i * cols + j] - mean[i]; + T data_1 = input[i * cols + j + 1] - mean[i]; + T data_2 = input[i * cols + j + 2] - mean[i]; + T data_3 = input[i * cols + j + 3] - mean[i]; + T data_4 = input[i * cols + j + 4] - mean[i]; + T data_5 = input[i * cols + j + 5] - mean[i]; + T data_6 = input[i * cols + j + 6] - mean[i]; + T data_7 = input[i * cols + j + 7] - mean[i]; + rvariance[i] += data_0 * data_0 + data_1 * data_1 + data_2 * data_2 + + data_3 * data_3 + data_4 * data_4 + data_5 * data_5 + + data_6 * data_6 + data_7 * data_7; + } + for (; j < cols; j++) { + T data = input[i * cols + j] - mean[i]; + rvariance[i] += data * data; + } + rvariance[i] *= one_over_cols; + rvariance[i] += epsilon; + rvariance[i] = 1.0f / sqrtf(rvariance[i]); + + for (j = 0; j + 7 < cols; j += 8) { + T data_0 = (input[i * cols + j] - mean[i]) * rvariance[i]; + T data_1 = (input[i * cols + j + 1] - mean[i]) * rvariance[i]; + T data_2 = (input[i * cols + j + 2] - mean[i]) * rvariance[i]; + T data_3 = (input[i * cols + j + 3] - 
mean[i]) * rvariance[i]; + T data_4 = (input[i * cols + j + 4] - mean[i]) * rvariance[i]; + T data_5 = (input[i * cols + j + 5] - mean[i]) * rvariance[i]; + T data_6 = (input[i * cols + j + 6] - mean[i]) * rvariance[i]; + T data_7 = (input[i * cols + j + 7] - mean[i]) * rvariance[i]; + output[i * cols + j] = gamma[j] * data_0 + beta[j]; + output[i * cols + j + 1] = gamma[j] * data_1 + beta[j]; + output[i * cols + j + 2] = gamma[j] * data_2 + beta[j]; + output[i * cols + j + 3] = gamma[j] * data_3 + beta[j]; + output[i * cols + j + 4] = gamma[j] * data_4 + beta[j]; + output[i * cols + j + 5] = gamma[j] * data_5 + beta[j]; + output[i * cols + j + 6] = gamma[j] * data_6 + beta[j]; + output[i * cols + j + 7] = gamma[j] * data_7 + beta[j]; + } + for (; j < cols; j++) { + T data = (input[i * cols + j] - mean[i]) * rvariance[i]; + output[i * cols + j] = gamma[j] * data + beta[j]; + } + } + } + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // AVX512 block size = 8; pack 8 * 16 = 128; + inline void forward_avx512(const float* input, const float* gamma, + const float* beta, float* output, float* mean, + float* rvariance, int64 cols, int64 begin_row, + int64 end_row, int64 block_num, + int64 remainder_block_num, + int64 remainder_block_num_total, + int64 remainder_128, int64 remainder_16, + const float one_over_cols) { + for (int64 i = begin_row; i < end_row; ++i) { + // Sum + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + auto load = [&](auto idx) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + }; + compile_time_for<8>::op(load); + __m512 block_sum = reduce_sum_block<8>(inputs); + mean[i] += _mm512_reduce_add_ps(block_sum); + } + if (remainder_block_num_total) { // remainder sum + __m512 inputs[remainder_block_num_total]; + for (int64 idx = 0; idx < remainder_block_num; idx++) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + cols - + remainder_128 + 16 * idx); + } + if (remainder_16) { + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + inputs[remainder_block_num] = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + } + __m512 block_sum = + reduce_sum_block_ps(inputs, remainder_block_num_total); + mean[i] += _mm512_reduce_add_ps(block_sum); + } + + // Mean + mean[i] *= one_over_cols; + __m512 means = _mm512_set1_ps(mean[i]); + + // Variance + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + auto load_var = [&](auto idx) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], inputs[idx]); + }; + compile_time_for<8>::op(load_var); + __m512 block_sum = reduce_sum_block<8>(inputs); + rvariance[i] += _mm512_reduce_add_ps(block_sum); + } + if (remainder_block_num_total) { // remainder var + __m512 inputs[remainder_block_num_total]; + for (int64 idx = 0; idx < remainder_block_num; idx++) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + cols - + remainder_128 + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], inputs[idx]); + } + if (remainder_16) { + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + inputs[remainder_block_num] = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + inputs[remainder_block_num] = + _mm512_maskz_sub_ps(mask, inputs[remainder_block_num], means); + inputs[remainder_block_num] = _mm512_maskz_mul_ps( + mask, inputs[remainder_block_num], inputs[remainder_block_num]); + } + __m512 block_sum = + 
reduce_sum_block_ps(inputs, remainder_block_num_total); + rvariance[i] += _mm512_reduce_add_ps(block_sum); + } + + rvariance[i] *= one_over_cols; + rvariance[i] += epsilon; + rvariance[i] = 1.0f / sqrtf(rvariance[i]); + __m512 rvariances = _mm512_set1_ps(rvariance[i]); + // Normalize and store + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + __m512 nums[8]; // used to load gammas and betas + auto load_normalize = [&](auto idx) { + // (x - mean) / sqrt(var + eps) + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], rvariances); + // Mul gamma + nums[idx] = _mm512_loadu_ps(gamma + 128 * j + 16 * idx); + inputs[idx] = _mm512_mul_ps(inputs[idx], nums[idx]); + // Add beta + nums[idx] = _mm512_loadu_ps(beta + 128 * j + 16 * idx); + inputs[idx] = _mm512_add_ps(inputs[idx], nums[idx]); + + // Store + _mm512_storeu_ps(output + cols * i + 128 * j + 16 * idx, inputs[idx]); + }; + compile_time_for<8>::op(load_normalize); + } + if (remainder_block_num_total) { // remainder normalize and store + __m512 inputs; + __m512 nums; // used to load gammas and betas + for (int64 idx = 0; idx < remainder_block_num; + idx++) { // remainder of 128 + // (x - mean) / sqrt(var + eps) + inputs = _mm512_loadu_ps(input + cols * i + cols - remainder_128 + + 16 * idx); + inputs = _mm512_sub_ps(inputs, means); + inputs = _mm512_mul_ps(inputs, rvariances); + // Mul gamma + nums = _mm512_loadu_ps(gamma + cols - remainder_128 + 16 * idx); + inputs = _mm512_mul_ps(inputs, nums); + // Add beta + nums = _mm512_loadu_ps(beta + cols - remainder_128 + 16 * idx); + inputs = _mm512_add_ps(inputs, nums); + + // Store + _mm512_storeu_ps(output + cols * i + cols - remainder_128 + 16 * idx, + inputs); + } + if (remainder_16) { // remainder of 16 + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + // (x - mean) / sqrt(var + eps) + inputs = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + inputs = _mm512_maskz_sub_ps(mask, inputs, means); + inputs = _mm512_maskz_mul_ps(mask, inputs, rvariances); + // Mul gamma + nums = _mm512_maskz_loadu_ps(mask, gamma + cols - remainder_16); + inputs = _mm512_maskz_mul_ps(mask, inputs, nums); + // Add beta + nums = _mm512_maskz_loadu_ps(mask, beta + cols - remainder_16); + inputs = _mm512_maskz_add_ps(mask, inputs, nums); + + // Store + _mm512_mask_storeu_ps(output + cols * i + cols - remainder_16, mask, + inputs); + } + } + } + } + +#endif // forward layer norm avx512 impl +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedLayerNorm").Device(DEVICE_CPU).TypeConstraint("T"), + FusedLayerNormOp); + +template +class FusedLayerNormGradOp : public OpKernel { + public: + explicit FusedLayerNormGradOp(OpKernelConstruction* context) + : OpKernel(context) {} + + ~FusedLayerNormGradOp() {} + + void Compute(OpKernelContext* context) override { + // Grab the input + const Tensor* y_grad_tensor = &context->input(0); + const Tensor* x_tensor = &context->input(1); + const Tensor* mean_tensor = &context->input(2); + const Tensor* rvariance_tensor = &context->input(3); + const Tensor* gamma_tensor = &context->input(4); + + const T* y_grad = y_grad_tensor->flat().data(); + const T* x = x_tensor->flat().data(); + const float* mean = mean_tensor->flat().data(); + const float* rvariance = rvariance_tensor->flat().data(); + const float* gamma = gamma_tensor->flat().data(); + + int64 cols = x_tensor->dim_size(x_tensor->dims() - 1); + int64 rows = mean_tensor->NumElements(); + + // 
Create output tensors + Tensor* x_grad_tensor = NULL; + Tensor* gamma_grad_tensor = NULL; + Tensor* beta_grad_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, x_tensor->shape(), + &x_grad_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(1, gamma_tensor->shape(), + &gamma_grad_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(2, gamma_tensor->shape(), + &beta_grad_tensor)); + T* x_grad = x_grad_tensor->flat().data(); + float* gamma_grad = gamma_grad_tensor->flat().data(); + float* beta_grad = beta_grad_tensor->flat().data(); + + // Init + memset(gamma_grad, 0, sizeof(float) * cols); + memset(beta_grad, 0, sizeof(float) * cols); + + auto& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + thread::ThreadPool* thread_pool = worker_threads.workers; + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + const int total_unit = (rows >= 128 ? 8 : (rows + 15) / 16); + const int64 rows_per_unit = (rows + total_unit - 1) / total_unit; + const int64 unit_cost = rows_per_unit * cols * 100; + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * rows_per_unit; + auto end_row = end_unit * rows_per_unit; + if (end_row > rows) { + end_row = rows; + } + backward(y_grad, x, mean, rvariance, gamma, x_grad, gamma_grad, + beta_grad, cols, begin_row, end_row); + }); +#else + const float one_over_cols = 1.0f / cols; + const int64 total_unit = (rows + 15) / 16; + const int64 unit_cost = + 16 * cols * 100; // assume every element consumes 100 cycles + + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * 16; + auto end_row = end_unit * 16; + if (end_row > rows) { + end_row = rows; + } + backward(y_grad, x, mean, rvariance, gamma, x_grad, gamma_grad, + beta_grad, begin_row, end_row, cols, one_over_cols); + }); +#endif // backward compute + } + + private: +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + void backward(const float* diff, const float* x, const float* mean, + const float* rvariance, const float* gamma, float* x_diff, + float* gamma_diff, float* beta_diff, int64 cols, int begin_row, + int end_row) { + int i = begin_row; + for (; i + 3 < end_row; i += 4) { + backward_avx512<4>(diff, x, mean, rvariance, gamma, x_diff, gamma_diff, + beta_diff, cols, i); + } + for (; i < end_row; ++i) { + backward_avx512<1>(diff, x, mean, rvariance, gamma, x_diff, gamma_diff, + beta_diff, cols, i); + } + } +#else + // For gradient of x, it comes from 3 parts: x-mean, mean, and rvariance + // grad from (x - mean): y_grad * gamma * [rvariance] + // grad from mean: - sum_row(y_grad * gamma * [rvariance]) / #cols + // grad from rvariance: sum_row(y_grad * gamma * (x - mean)) * (- + // [rvariance]^3) * (x - mean) / #cols + // For gradient of gamma, grad = y_grad * (x - mean) * rvariance + // For gradient of beta, grad = y_grad + void backward(const float* y_grad, const float* x, const float* mean, + const float* rvariance, const float* gamma, float* x_grad, + float* gamma_grad, float* beta_grad, int64 begin_row, + int64 end_row, int64 cols, const float one_over_cols) { + for (int64 i = begin_row; i < end_row; ++i) { + int64 j = 0; + float sum_m = 0; + float sum_r = 0; + // sum_m: sum_row(y_grad * gamma) + // sum_r: sum_row(y_grad * gamma * (x - mean)) + for (; j + 7 < cols; j += 8) { + T data_0 = y_grad[i * cols + j] * gamma[j]; + T data_1 = y_grad[i * cols + j + 1] * gamma[j + 1]; + T data_2 = y_grad[i * 
cols + j + 2] * gamma[j + 2]; + T data_3 = y_grad[i * cols + j + 3] * gamma[j + 3]; + T data_4 = y_grad[i * cols + j + 4] * gamma[j + 4]; + T data_5 = y_grad[i * cols + j + 5] * gamma[j + 5]; + T data_6 = y_grad[i * cols + j + 6] * gamma[j + 6]; + T data_7 = y_grad[i * cols + j + 7] * gamma[j + 7]; + sum_m += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + data_6 + + data_7; + + data_0 = data_0 * (x[i * cols + j] - mean[i]); + data_1 = data_1 * (x[i * cols + j + 1] - mean[i]); + data_2 = data_2 * (x[i * cols + j + 2] - mean[i]); + data_3 = data_3 * (x[i * cols + j + 3] - mean[i]); + data_4 = data_4 * (x[i * cols + j + 4] - mean[i]); + data_5 = data_5 * (x[i * cols + j + 5] - mean[i]); + data_6 = data_6 * (x[i * cols + j + 6] - mean[i]); + data_7 = data_7 * (x[i * cols + j + 7] - mean[i]); + sum_r += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + data_6 + + data_7; + } + for (; j < cols; ++j) { // remainder + sum_m += y_grad[i * cols + j] * gamma[j]; + sum_r += y_grad[i * cols + j] * gamma[j] * (x[i * cols + j] - mean[i]); + } + sum_m *= one_over_cols; + sum_r *= rvariance[i] * rvariance[i]; + sum_r *= one_over_cols; + + for (j = 0; j + 7 < cols; j += 8) { + x_grad[i * cols + j] = y_grad[i * cols + j] * gamma[j]; + x_grad[i * cols + j + 1] = y_grad[i * cols + j + 1] * gamma[j + 1]; + x_grad[i * cols + j + 2] = y_grad[i * cols + j + 2] * gamma[j + 2]; + x_grad[i * cols + j + 3] = y_grad[i * cols + j + 3] * gamma[j + 3]; + x_grad[i * cols + j + 4] = y_grad[i * cols + j + 4] * gamma[j + 4]; + x_grad[i * cols + j + 5] = y_grad[i * cols + j + 5] * gamma[j + 5]; + x_grad[i * cols + j + 6] = y_grad[i * cols + j + 6] * gamma[j + 6]; + x_grad[i * cols + j + 7] = y_grad[i * cols + j + 7] * gamma[j + 7]; + + x_grad[i * cols + j] -= sum_m + sum_r * (x[i * cols + j] - mean[i]); + x_grad[i * cols + j + 1] -= + sum_m + sum_r * (x[i * cols + j + 1] - mean[i]); + x_grad[i * cols + j + 2] -= + sum_m + sum_r * (x[i * cols + j + 2] - mean[i]); + x_grad[i * cols + j + 3] -= + sum_m + sum_r * (x[i * cols + j + 3] - mean[i]); + x_grad[i * cols + j + 4] -= + sum_m + sum_r * (x[i * cols + j + 4] - mean[i]); + x_grad[i * cols + j + 5] -= + sum_m + sum_r * (x[i * cols + j + 5] - mean[i]); + x_grad[i * cols + j + 6] -= + sum_m + sum_r * (x[i * cols + j + 6] - mean[i]); + x_grad[i * cols + j + 7] -= + sum_m + sum_r * (x[i * cols + j + 7] - mean[i]); + + x_grad[i * cols + j] *= rvariance[i]; + x_grad[i * cols + j + 1] *= rvariance[i]; + x_grad[i * cols + j + 2] *= rvariance[i]; + x_grad[i * cols + j + 3] *= rvariance[i]; + x_grad[i * cols + j + 4] *= rvariance[i]; + x_grad[i * cols + j + 5] *= rvariance[i]; + x_grad[i * cols + j + 6] *= rvariance[i]; + x_grad[i * cols + j + 7] *= rvariance[i]; + } + for (; j < cols; ++j) { // remainder + x_grad[i * cols + j] = y_grad[i * cols + j] * gamma[j]; + x_grad[i * cols + j] -= sum_m + sum_r * (x[i * cols + j] - mean[i]); + x_grad[i * cols + j] *= rvariance[i]; + } + + // grad of gamma + for (j = 0; j + 7 < cols; j += 8) { + gamma_grad[j] += + y_grad[i * cols + j] * (x[i * cols + j] - mean[i]) * rvariance[i]; + gamma_grad[j + 1] += y_grad[i * cols + j + 1] * + (x[i * cols + j + 1] - mean[i]) * rvariance[i]; + gamma_grad[j + 2] += y_grad[i * cols + j + 2] * + (x[i * cols + j + 2] - mean[i]) * rvariance[i]; + gamma_grad[j + 3] += y_grad[i * cols + j + 3] * + (x[i * cols + j + 3] - mean[i]) * rvariance[i]; + gamma_grad[j + 4] += y_grad[i * cols + j + 4] * + (x[i * cols + j + 4] - mean[i]) * rvariance[i]; + gamma_grad[j + 5] += y_grad[i * cols + j + 5] * + (x[i * cols + 
j + 5] - mean[i]) * rvariance[i]; + gamma_grad[j + 6] += y_grad[i * cols + j + 6] * + (x[i * cols + j + 6] - mean[i]) * rvariance[i]; + gamma_grad[j + 7] += y_grad[i * cols + j + 7] * + (x[i * cols + j + 7] - mean[i]) * rvariance[i]; + } + for (; j < cols; ++j) { // remainder + gamma_grad[j] += + y_grad[i * cols + j] * (x[i * cols + j] - mean[i]) * rvariance[i]; + } + + // grad of beta + for (j = 0; j + 7 < cols; j += 8) { + beta_grad[j] += y_grad[i * cols + j]; + beta_grad[j + 1] += y_grad[i * cols + j + 1]; + beta_grad[j + 2] += y_grad[i * cols + j + 2]; + beta_grad[j + 3] += y_grad[i * cols + j + 3]; + beta_grad[j + 4] += y_grad[i * cols + j + 4]; + beta_grad[j + 5] += y_grad[i * cols + j + 5]; + beta_grad[j + 6] += y_grad[i * cols + j + 6]; + beta_grad[j + 7] += y_grad[i * cols + j + 7]; + } + for (; j < cols; ++j) { // remainder + beta_grad[j] += y_grad[i * cols + j]; + } + } + } +#endif // backward define + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + template + inline void backward_avx512(const float* y_grad, const float* x, + const float* mean, const float* rvariance, + const float* gamma, float* x_grad, + float* gamma_grad, float* beta_grad, int64 cols, + int64 start_row) { + float sum_m[ROWS], sum_r[ROWS]; + __m512 vsum_m[ROWS], vsum_r[ROWS], vmean[ROWS], vrvariance[ROWS]; + + // Init + auto setzero = [&](auto idx) { + vsum_m[idx] = _mm512_setzero_ps(); + vsum_r[idx] = _mm512_setzero_ps(); + vmean[idx] = _mm512_set1_ps(mean[start_row + idx]); + vrvariance[idx] = _mm512_set1_ps(rvariance[start_row + idx]); + }; + compile_time_for::op(setzero); + + // Compute sum for y_grad * gamma and y_grad * gamma * (x - mean) + int64 j = 0; + for (; j + 15 < cols; j += 16) { + auto compute_sum = [&](auto idx) { + __m512 vy_grad = _mm512_loadu_ps(y_grad + (start_row + idx) * cols + j); + __m512 vgamma = _mm512_loadu_ps(gamma + j); + + __m512 mul = _mm512_mul_ps(vy_grad, vgamma); + vsum_m[idx] = _mm512_add_ps(mul, vsum_m[idx]); + + __m512 vx = _mm512_loadu_ps(x + (start_row + idx) * cols + j); + __m512 x_minus_mean = _mm512_sub_ps(vx, vmean[idx]); + vsum_r[idx] = _mm512_fmadd_ps(mul, x_minus_mean, vsum_r[idx]); + }; + + compile_time_for::op(compute_sum); + } + + auto reduce_sum = [&](auto idx) { + sum_m[idx] = horizontal_add(vsum_m[idx]); + sum_r[idx] = horizontal_add(vsum_r[idx]); + + for (int64 c = j; c < cols; ++c) { + const auto offset = (start_row + idx) * cols + c; + sum_m[idx] += y_grad[offset] * gamma[c]; + sum_r[idx] += + y_grad[offset] * gamma[c] * (x[offset] - mean[start_row + idx]); + } + + sum_m[idx] /= cols; + sum_r[idx] *= rvariance[start_row + idx] * rvariance[start_row + idx]; + sum_r[idx] /= cols; + + vsum_m[idx] = _mm512_set1_ps(sum_m[idx]); + vsum_r[idx] = _mm512_set1_ps(sum_r[idx]); + }; + + compile_time_for::op(reduce_sum); + + // Compute gradient for x, gamma, beta + for (j = 0; j + 15 < cols; j += 16) { + __m512 vgamma_grad = _mm512_loadu_ps(gamma_grad + j); + __m512 vbeta_grad = _mm512_loadu_ps(beta_grad + j); + + auto compute_grad = [&](auto idx) { + __m512 vy_grad = _mm512_loadu_ps(y_grad + (start_row + idx) * cols + j); + __m512 vgamma = _mm512_loadu_ps(gamma + j); + + __m512 vx_grad = _mm512_mul_ps(vy_grad, vgamma); + + __m512 vx = _mm512_loadu_ps(x + (start_row + idx) * cols + j); + __m512 x_minus_mean = _mm512_sub_ps(vx, vmean[idx]); + + vx_grad = _mm512_sub_ps( + vx_grad, _mm512_fmadd_ps(vsum_r[idx], x_minus_mean, vsum_m[idx])); + vx_grad = _mm512_mul_ps(vx_grad, vrvariance[idx]); + + // save gradient of x + _mm512_storeu_ps(x_grad + (start_row + 
idx) * cols + j, vx_grad); + + // gradient for gamma and beta + vgamma_grad = _mm512_fmadd_ps(_mm512_mul_ps(vy_grad, x_minus_mean), + vrvariance[idx], vgamma_grad); + vbeta_grad = _mm512_add_ps(vy_grad, vbeta_grad); + }; + + compile_time_for::op(compute_grad); + + // save gradient of gamma, beta + _mm512_storeu_ps(gamma_grad + j, vgamma_grad); + _mm512_storeu_ps(beta_grad + j, vbeta_grad); + } + + // Deal with the remain data + if (cols % 16 != 0) { + int remain = cols % 16; + auto remain_grad = [&](auto idx) { + for (int64 c = j; c < cols; ++c) { + const auto offset = (start_row + idx) * cols + c; + float vx_grad = y_grad[offset] * gamma[c]; + float x_minus_mean = x[offset] - mean[start_row + idx]; + vx_grad -= sum_m[idx] + sum_r[idx] * x_minus_mean; + vx_grad *= rvariance[start_row + idx]; + + // save gradient of x + x_grad[offset] = vx_grad; + + // gradient for gamma and beta + gamma_grad[c] += + y_grad[offset] * x_minus_mean * rvariance[start_row + idx]; + beta_grad[c] += y_grad[offset]; + } + }; + + compile_time_for::op(remain_grad); + } + } +#endif // backward layer norm avx512 impl +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedLayerNormGrad").Device(DEVICE_CPU).TypeConstraint("T"), + FusedLayerNormGradOp); diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc new file mode 100644 index 00000000..45b5ea15 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc @@ -0,0 +1,269 @@ +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedLayerNormalizeOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, DataType dtype, int axis, + float epsilon) { + TF_EXPECT_OK(NodeDefBuilder("fused_layer_normalize", "FusedLayerNorm") + .Attr("T", dtype) + .Attr("epsilon", epsilon) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedLayerNormalizeOpTest, 2Dims_Float) { + const int rows = 7; + const int cols = 255; + + MakeOpAndSetDevice(Device::CPU, DT_FLOAT, 0, 1e-12); + + float input_array[1785]; + for (int i = 0; i < sizeof(input_array) / sizeof(float); i++) { + input_array[i] = 1.0; + } + for (int i = 0; i < rows; i++) { + input_array[i * cols] = 2.0; + } + AddInputFromArray(TensorShape({rows, cols}), input_array); + AddInput(TensorShape({cols}), [](int i) -> float { return 2.0; }); + AddInput(TensorShape({cols}), [](int i) -> float { return 1.0; }); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_output(allocator(), DT_FLOAT, TensorShape({rows, cols})); + Tensor mean(allocator(), DT_FLOAT, TensorShape({rows})); + Tensor rvariance(allocator(), DT_FLOAT, TensorShape({rows})); + float 
output_array[1785]; + float rvar_value = 16.000125885009766f; + float mean_value = 256.0f / 255.0f; + // 1.00392162799835205f; + for (int i = 0; i < sizeof(output_array) / sizeof(float); i++) { + output_array[i] = 0.87450695037841797; + } + for (int i = 0; i < rows; i++) { + output_array[i * cols] = 2.0f * sqrtf(254.0f) + 1.0f; + // 32.874755859375; + } + + float mean_array[rows]; + for (int i = 0; i < sizeof(mean_array) / sizeof(float); i++) { + mean_array[i] = mean_value; + } + + float rvariance_array[rows]; + + for (int i = 0; i < sizeof(rvariance_array) / sizeof(float); i++) { + rvariance_array[i] = rvar_value; + } + test::FillValues(&expected_output, output_array); + test::FillValues(&mean, mean_array); + test::FillValues(&rvariance, rvariance_array); + test::ExpectTensorNear(expected_output, *GetOutput(0), 1e-5); + test::ExpectTensorNear(mean, *GetOutput(1), 1e-5); + test::ExpectTensorNear(rvariance, *GetOutput(2), 1e-5); + } +} + +class FusedLayerNormalizeGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, DataType dtype, int axis, + float epsilon) { + TF_EXPECT_OK( + NodeDefBuilder("fused_layer_normalize_grad", "FusedLayerNormGrad") + .Attr("T", dtype) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedLayerNormalizeGradOpTest, 2Dims_Float) { + const int rows = 7; + const int cols = 255; + + MakeOpAndSetDevice(Device::CPU, DT_FLOAT, 0, 1e-12); + + AddInput(TensorShape({rows, cols}), + [](int i) -> float { return 1.0f; }); // y_grad + AddInput(TensorShape({rows, cols}), [](int i) -> float { + return (i % cols) ? 1.0f : 2.0f; + }); // x + AddInput(TensorShape({rows}), + [](int i) -> float { return 256.0f / 255.0f; }); // mean + AddInput(TensorShape({rows}), [](int i) -> float { + return 16.00012302493275484; + }); // rvariance + AddInput(TensorShape({cols}), + [](int i) -> float { return 2.0f; }); // gamma + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_output(allocator(), DT_FLOAT, TensorShape({rows, cols})); + Tensor gamma_grad(allocator(), DT_FLOAT, TensorShape({cols})); + Tensor beta_grad(allocator(), DT_FLOAT, TensorShape({cols})); + float x_grad[1785]; + for (int i = 0; i < sizeof(x_grad) / sizeof(float); i++) { + x_grad[i] = 0.0f; + } + for (int i = 0; i < rows; i++) { + x_grad[i * cols] = 0.00048447030712850392f; + } + + float gamma_grads[cols]; + for (int i = 0; i < sizeof(gamma_grads) / sizeof(float); i++) { + gamma_grads[i] = -0.4392257034778595; + } + gamma_grads[0] = 111.56163787841797; + + float beta_grads[cols]; + for (int i = 0; i < sizeof(beta_grads) / sizeof(float); i++) { + beta_grads[i] = 7.0f; + } + test::FillValues(&expected_output, x_grad); + test::FillValues(&gamma_grad, gamma_grads); + test::FillValues(&beta_grad, beta_grads); + test::ExpectTensorNear(expected_output, *GetOutput(0), 1e-5); + test::ExpectTensorNear(gamma_grad, *GetOutput(1), 1e-5); + test::ExpectTensorNear(beta_grad, *GetOutput(2), 1e-5); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// +static Graph* FusedLayerNormalize(int rows, int cols) { + Graph* g = new Graph(OpRegistry::Global()); + DataType dtype = DT_FLOAT; + + Tensor in(dtype, TensorShape({rows, cols})); + in.flat().setRandom(); + 
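+  // gamma and beta below are the per-column scale and shift; FusedLayerNorm
+  // requires size(gamma) == size(beta) == the last dimension of the input, so
+  // both are shaped {cols} to match the {rows, cols} input built above.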
Tensor gamma(dtype, TensorShape({cols})); + gamma.flat().setRandom(); + Tensor beta(dtype, TensorShape({cols})); + beta.flat().setRandom(); + + Node* input_in = test::graph::Constant(g, in); + Node* input_gamma = test::graph::Constant(g, gamma); + Node* input_beta = test::graph::Constant(g, beta); + auto nodeBuilder = NodeBuilder(g->NewName("n"), "FusedLayerNorm") + .Input(input_in) + .Input(input_gamma) + .Input(input_beta) + .Attr("T", dtype) + .Attr("epsilon", 1e-12); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + + return g; +} + +#define BM_FusedLayerNorm(ROWS, COLS, NTH) \ + static void BM_FusedLayerNorm##_##ROWS##_##COLS##_##NTH##_CPU(int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast(iters) * ROWS * COLS * 3); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark("cpu", FusedLayerNormalize(ROWS, COLS), &opts).Run(iters); \ + } \ + BENCHMARK(BM_FusedLayerNorm##_##ROWS##_##COLS##_##NTH##_CPU); + +#define BM_FusedLayerNorm_NTH(ROWS, COLS) \ + BM_FusedLayerNorm(ROWS, COLS, 1); \ + BM_FusedLayerNorm(ROWS, COLS, 4); \ + BM_FusedLayerNorm(ROWS, COLS, 8); + +BM_FusedLayerNorm_NTH(1024, 63); +BM_FusedLayerNorm_NTH(1024, 255); +BM_FusedLayerNorm_NTH(1024, 511); +BM_FusedLayerNorm_NTH(1024, 1023); +BM_FusedLayerNorm_NTH(1024, 1024); +BM_FusedLayerNorm_NTH(1024, 2048); +BM_FusedLayerNorm_NTH(1024, 4096); + +} // namespace + +static Graph* FusedLayerNormalizeGrad(int rows, int cols) { + Graph* g = new Graph(OpRegistry::Global()); + DataType dtype = DT_FLOAT; + + Tensor y_grad(dtype, TensorShape({rows, cols})); + y_grad.flat().setRandom(); + Tensor x(dtype, TensorShape({rows, cols})); + x.flat().setRandom(); + Tensor mean(dtype, TensorShape({rows})); + mean.flat().setRandom(); + Tensor rvarance(dtype, TensorShape({rows})); + rvarance.flat().setRandom(); + Tensor gamma(dtype, TensorShape({cols})); + gamma.flat().setRandom(); + + Node* input_y_grad = test::graph::Constant(g, y_grad); + Node* input_x = test::graph::Constant(g, x); + Node* input_mean = test::graph::Constant(g, mean); + Node* input_rvarance = test::graph::Constant(g, rvarance); + Node* input_gamma = test::graph::Constant(g, gamma); + auto nodeBuilder = NodeBuilder(g->NewName("n"), "FusedLayerNormGrad") + .Input(input_y_grad) + .Input(input_x) + .Input(input_mean) + .Input(input_rvarance) + .Input(input_gamma) + .Attr("T", dtype); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + + return g; +} + +#define BM_FusedLayerNormGrad(ROWS, COLS, NTH) \ + static void BM_FusedLayerNormGrad##_##ROWS##_##COLS##_##NTH##_CPU( \ + int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast(iters) * ROWS * COLS * 3); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark("cpu", FusedLayerNormalizeGrad(ROWS, COLS), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_FusedLayerNormGrad##_##ROWS##_##COLS##_##NTH##_CPU); + +#define BM_FusedLayerNormGrad_NTH(ROWS, COLS) \ + BM_FusedLayerNormGrad(ROWS, COLS, 1); \ + BM_FusedLayerNormGrad(ROWS, COLS, 4); \ + BM_FusedLayerNormGrad(ROWS, COLS, 8); + +BM_FusedLayerNormGrad_NTH(1024, 63); +BM_FusedLayerNormGrad_NTH(1024, 255); +BM_FusedLayerNormGrad_NTH(1024, 511); +BM_FusedLayerNormGrad_NTH(1024, 1023); +BM_FusedLayerNormGrad_NTH(1024, 1024); +BM_FusedLayerNormGrad_NTH(1024, 2048); +BM_FusedLayerNormGrad_NTH(1024, 4096); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc 
b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc new file mode 100644 index 00000000..4603ed2f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc @@ -0,0 +1,176 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; + +using CPUDevice = Eigen::ThreadPoolDevice; + +template +class GroupEmbeddingVariableLookupDenseCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupEmbeddingVariableLookupDenseCpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext* ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing parallel unique value gather + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar* embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + auto dense_values = dense_values_tensor.flat().data(); + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, dense_values, gather_embedding, nnz, + reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data())); + } else { + embedding_var->GetEmbeddings(ev_ctx, dense_values, 
gather_embedding, + nnz); + embedding_var->UpdateCache(dense_values_tensor, true); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookupDense") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupDenseCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupDenseCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupVariableLookupDenseCpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) {} + + void Compute(OpKernelContext* ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing parallel unique value gather + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + const Tensor& emb_variable_tensor = ctx->input(i); + auto embedding_variable = emb_variable_tensor.flat().data(); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis( + ctx, dense_values_tensor, &unique_idx_tensor, &unique_tensor, + &unique_counter, 0, this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + auto* unique = unique_tensor.flat().data(); + auto* unique_idx = unique_idx_tensor.flat().data(); + int slice_bytes = nnz * m_dimension * 1000; + auto do_lookup = [this, ctx, embedding_variable, unique, unique_idx, + gather_embedding](int64 start, int64 end) { + for (int k = start; k < end; ++k) { + auto indices = unique_idx[k]; + TKey unique_id = unique[indices]; + memcpy(gather_embedding + k * m_dimension, + embedding_variable + unique_id * m_dimension, + sizeof(float) * m_dimension); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, nnz, + slice_bytes, do_lookup); + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookupDense") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupDenseCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc new file mode 100644 index 00000000..fefd6041 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc @@ -0,0 +1,105 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; + +template +class GroupEmbeddingVariableLookupDenseGpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupEmbeddingVariableLookupDenseGpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar* embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + auto dense_values = dense_values_tensor.flat().data(); + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, dense_values, gather_embedding, nnz, + reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data()), + stream); + } else { + embedding_var->GetEmbeddings(ev_ctx, dense_values, gather_embedding, + nnz, nullptr, stream); + embedding_var->UpdateCache(dense_values_tensor, true, stream); + } + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookupDense") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupDenseGpuOp) + +REGISTER_GPU_KERNELS(int32, float); 
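+// Explanatory note: the REGISTER_GPU_KERNELS invocation above and the one
+// below instantiate GroupEmbeddingVariableLookupDenseGpuOp for int32 and
+// int64 keys with float values. The GPU path mirrors the CPU op earlier in
+// this patch, differing only in that the CUDA stream obtained from the Eigen
+// GPU device is threaded through GetEmbeddings() and UpdateCache().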
+REGISTER_GPU_KERNELS(int64, float); +#undef REGISTER_GPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER +} // namespace tensorflow + +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc new file mode 100644 index 00000000..e62668fc --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc @@ -0,0 +1,1089 @@ +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "deepray/custom_ops/utils/fake_input.h" +#include "deepray/custom_ops/utils/kernel_benchmark_testlib.h" +#include "deepray/custom_ops/utils/ops_testutil.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +// #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +enum DEVICE { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string& combiner_str, float& max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_var_vector_expected(Tensor* expected); + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 
77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class GroupVariableForWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_variable_lookup", "GroupVariableLookup") + .Input(FakeInput(num_lookups, v_dtype)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // sp_values + .Input(FakeInput(num_lookups, DT_INT64)) // sp_indices + .Input(FakeInput(num_lookups, v_dtype)) // sp_weights + .Input(FakeInput(DT_INT32)) // dense_shape + .Input(FakeInput(v_dtype)) // default_value + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Attr("ignore_weights", true) + .Attr("is_use_default_value_tensor", false) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor emb_variable(v_dtype, {bucket_size, emb_vector_dim}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(emb_variable.shape(), + emb_variable.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 
4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_indices(DT_INT64, {nnz}); + test::FillValues(&sp_indices, {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_weights(v_dtype, {nnz}); + test::FillValues(&sp_weights, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f}); + AddInputFromArray(sp_weights.shape(), sp_weights.flat()); + } + + Tensor* batch_size_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto batch_size_data = batch_size_tensor->flat().data(); + batch_size_data[0] = batch_size; + + Tensor* default_v_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto default_v = default_v_tensor->flat().data(); + default_v[0] = 1.0f; + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(v_dtype, {batch_size, emb_vector_dim}); + Tensor unique_values_expected(DT_INT64, {7}); + Tensor unique_idx_expected(DT_INT32, {nnz}); + Tensor batch_size_expected(DT_INT32, {batch_size}); + + fill_var_vector_expected(&emb_vector_expected); + + if (device == DEVICE::GPU) { + test::FillValues(&batch_size_expected, {0, 2, 5, 8}); + } else { + test::FillValues(&unique_values_expected, {3, 1, 4, 5, 7, 12, 15}); + test::FillValues(&unique_idx_expected, + {0, 1, 2, 3, 4, 0, 5, 5, 6, 2}); + test::FillValues(&batch_size_expected, {2, 5, 8, 10}); + } + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& emb_vector = *GetOutput(i); + const Tensor& unique_values = *GetOutput(num_lookups + i); + const Tensor& unique_idx_output = *GetOutput(2 * num_lookups + i); + const Tensor& batch_size_output = *GetOutput(3 * num_lookups + i); + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + if (device == DEVICE::CPU) { + test::ExpectTensorEqual(unique_values_expected, unique_values); + test::ExpectTensorEqual(unique_idx_expected, unique_idx_output); + } + test::ExpectTensorEqual(batch_size_expected, batch_size_output); + } + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSqrtnGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatMeanGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSumGpu) { + Run(DEVICE::GPU); +} + +// TEST_F(GroupVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSqrtnCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Cpu) { +// Run(DEVICE::CPU); +// } + +template +void fill_var_grad_expected(Tensor* expected); + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 
5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, + + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 12.000000000000000, + 12.500000000000000, 13.000000000000000, 13.500000000000000, + 14.000000000000000, 14.500000000000000, 15.000000000000000, + 15.500000000000000}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 
11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +// template <> +// void fill_var_grad_expected(Tensor* expected) { +// test::FillValues( +// expected, +// {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, +// 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, +// 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, +// 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, +// 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, +// 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, +// 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, +// 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, +// 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, +// 7.00000000, 7.33333349, 
7.66666651, 1.89459133, 2.01300311, +// 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, +// 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, +// 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, +// 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, +// 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, +// 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +// } + +class GroupVariableBackWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nnz = 7; + const int nums = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK( + NodeDefBuilder("group_variable_lookup_grad", "GroupVariableLookupGrad") + .Input(FakeInput(num_lookups, DT_FLOAT)) // grads + .Input(FakeInput(num_lookups, v_dtype)) // variable + .Input(FakeInput(num_lookups, k_dtype)) // unique_key + .Input(FakeInput(num_lookups, DT_INT64)) // unique_idx + .Input(FakeInput(num_lookups, DT_INT32)) // batch_nums + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor top_grad(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor emb_variable(v_dtype, {bucket_size, emb_vector_dim}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + AddInputFromArray(emb_variable.shape(), + emb_variable.flat()); + } + if (device == DEVICE::GPU) { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nums}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + 
sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nums, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } else { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 12, 15}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {2, 5, 8, 10}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nnz, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatSumGpu) { + Run(DEVICE::GPU); +} + +// TEST_F(GroupVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatSqrtCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Cpu) { +// Run(DEVICE::CPU); +// } + +template +void fill_ev_vector_expected(Tensor* expected); + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, + 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, + 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, + 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421}); +} + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }); +} + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}); +} + +template <> +void fill_ev_vector_expected(Tensor* 
expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + // TensorShapeProto tshape_proto; + // tshape_proto.add_dim()->set_size(8); + // TF_EXPECT_OK(NodeDefBuilder("kv_var_handle", "KvVarHandleOp") + // .Attr("dtype", v_dtype) + // .Attr("Tkeys", k_dtype) + // .Attr("shape", tshape_proto) + // .Attr("container", "EV") + // .Attr("shared_name", "EV") + // .Finalize(node_def())); + // TF_EXPECT_OK(InitOp()); + // TF_ASSERT_OK(RunOpKernel()); + // const Tensor& ev_resource = *GetOutput(0); + // ResourceHandle ev_handle = ev_resource.flat()(0); + + // TF_EXPECT_OK(NodeDefBuilder("initialize_kv_variable", + // "InitializeKvVariableOp") + // .Input(FakeInput(DT_RESOURCE)) // ev + // .Input(FakeInput(DT_RESOURCE)) // ev + // .Input(FakeInput(v_dtype)) // sp_values + // .Input(FakeInput(k_dtype)) // sp_indices + // .Attr("dtype", v_dtype) + // .Attr("Tkeys", k_dtype) + // .Attr("slot_num", 0) + // .Attr("shape", tshape_proto) + // .Attr("initial_num_buckets", 131072) // 2^17 + // .Attr("max_load_factor", 0.8) + // .Attr("steps_to_live", 0) + // .Attr("emb_index", 0) + // .Attr("block_num", 1) + // .Attr("slot_index", 0) + // .Attr("ht_partition_num", 1000) + // .Attr("filter_freq", 0) + // .Attr("max_freq", 999999) + // .Attr("max_element_size", 0) + // .Attr("counter_type", k_dtype) + // .Attr("false_positive_probability", -1.0) + // .Attr("l2_weight_threshold", -1.0) + // .Attr("layout", "") + // .Attr("storage_type", 0) + // .Attr("default_value_dim", 8) + // .Attr("default_value_no_permission", 0.0) + // .Attr("record_freq", false) + // .Attr("record_version", false) + // .Finalize(node_def())); + // TF_EXPECT_OK(InitOp()); + + // AddInputFromArray(TensorShape({}), {ev_handle}); + // AddInputFromArray(TensorShape({}), {ev_handle}); + + // Tensor default_values(v_dtype, {8}); + // test::FillValues(&default_values, + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + // AddInputFromArray(default_values.shape(), + // default_values.flat()); + // Tensor empty_key(k_dtype, {1}); + // test::FillValues(&empty_key, {-1}); + // AddInputFromArray(empty_key.shape(), empty_key.flat()); + // TF_ASSERT_OK(RunOpKernel()); + + // Clear Resource + // inputs_.clear(); + // gtl::STLDeleteElements(&tensors_); + // gtl::STLDeleteElements(&managed_outputs_); + + std::string combiner_str; + float max_norm; + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int num_lookups = 2; + std::vector sp_values_vec{3, 1, 4, 5, 7, 3, 12, 12, 15, 4}; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_embedding_variable_lookup", + "GroupEmbeddingVarLookup") + .Input(FakeInput(num_lookups, DT_RESOURCE)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // sp_values + 
.Input(FakeInput(num_lookups, DT_INT64)) // sp_indices + .Input(FakeInput(num_lookups, v_dtype)) // sp_weights + .Input(FakeInput(DT_INT32)) // dense_shape + .Input(FakeInput(v_dtype)) // default_value + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Attr("ignore_weights", true) + .Attr("is_use_default_value_tensor", false) + .Attr("is_inference", false) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + EmbeddingVar* embedding_var = nullptr; + Allocator* gpu_allocator = device_->GetAllocator(AllocatorAttributes()); + auto embedding_config = + EmbeddingConfig(0, 0, 1, 1, "", 0, 0, 99999, 14.0); + embedding::StorageType storage_type = embedding::StorageType::DRAM; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, false, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), 0}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", {1024, 1024, 1024, 1024}, + embedding_config), + gpu_allocator, feat_desc, "EV" + std::to_string(i)); + embedding_var = new EmbeddingVar("EV" + std::to_string(i), + storage, embedding_config, + gpu_allocator, feat_desc); + Tensor value(DT_FLOAT, TensorShape({emb_vector_dim})); + test::FillValues(&value, + std::vector(emb_vector_dim, 1.0)); + embedding_var->Init(value, 1); + + for (int64 j = 0; j < nnz; ++j) { + void* value_ptr = nullptr; + Status s = + embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); + typename TTypes::Flat vflat = embedding_var->flat(value_ptr); + } + AddResourceInput>("", "EV" + std::to_string(i), + embedding_var); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, sp_values_vec); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_indices(DT_INT64, {nnz}); + test::FillValues(&sp_indices, {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_weights(v_dtype, {nnz}); + test::FillValues(&sp_weights, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f}); + AddInputFromArray(sp_weights.shape(), sp_weights.flat()); + } + + Tensor* batch_size_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto batch_size_data = batch_size_tensor->flat().data(); + batch_size_data[0] = batch_size; + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(v_dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT64, {7}); + Tensor unique_idx_expected(DT_INT32, {nnz}); + Tensor batch_size_expected(DT_INT32, {batch_size}); + fill_ev_vector_expected(&emb_vector_expected); + + if (device == DEVICE::GPU) { + test::FillValues(&batch_size_expected, {0, 2, 5, 8}); + } else { + test::FillValues(&sp_values_offset_expected, + {3, 1, 4, 5, 7, 12, 15}); + test::FillValues(&unique_idx_expected, + {0, 1, 2, 3, 4, 0, 5, 5, 6, 2}); + test::FillValues(&batch_size_expected, {2, 5, 8, 10}); + } + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& emb_vector = *GetOutput(i); + const Tensor& values_offset = *GetOutput(num_lookups + i); + const Tensor& unique_idx_output = *GetOutput(2 * num_lookups + i); + const Tensor& batch_size_output = *GetOutput(3 * num_lookups + 
i); + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + // Currently GPU do not have Unique logic. + if (device == DEVICE::CPU) { + test::ExpectTensorEqual(sp_values_offset_expected, + values_offset); + test::ExpectTensorEqual(unique_idx_expected, unique_idx_output); + } + test::ExpectTensorEqual(batch_size_expected, batch_size_output); + } + } +}; + +#ifdef GOOGLE_CUDA +// TODO(junqi): Complete GPUEV related test +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatMeanGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSumGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatSqrtnCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupEmbeddingForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Cpu) { +// Run(DEVICE::CPU); +// } + +class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nums = 10; + const int nnz = 7; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + std::vector sp_values_vec{3, 1, 4, 5, 7, 3, 12, 12, 15, 4}; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_embedding_variable_lookup_grad", + "GroupEmbeddingVariableLookupGrad") + .Input(FakeInput(num_lookups, DT_FLOAT)) // grads + .Input(FakeInput(num_lookups, DT_RESOURCE)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // unique_key + .Input(FakeInput(num_lookups, DT_INT64)) // unique_idx + .Input(FakeInput(num_lookups, DT_INT32)) // batch_nums + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor top_grad(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + EmbeddingVar* embedding_var = nullptr; + Allocator* gpu_allocator = device_->GetAllocator(AllocatorAttributes()); + auto embedding_config = + EmbeddingConfig(0, 0, 1, 1, "", 0, 0, 99999, 14.0); + embedding::StorageType storage_type = embedding::StorageType::DRAM; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, 
ev_allocator(), storage_type, false, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), 0}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", {1024, 1024, 1024, 1024}, + embedding_config), + gpu_allocator, feat_desc, "EV" + std::to_string(i)); + embedding_var = new EmbeddingVar("EV" + std::to_string(i), + storage, embedding_config, + gpu_allocator, feat_desc); + Tensor value(DT_FLOAT, TensorShape({emb_vector_dim})); + test::FillValues(&value, + std::vector(emb_vector_dim, 1.0)); + embedding_var->Init(value, 1); + + for (int64 j = 0; j < nnz; ++j) { + void* value_ptr = nullptr; + Status s = + embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); + typename TTypes::Flat vflat = embedding_var->flat(value_ptr); + } + AddResourceInput>("", "EV" + std::to_string(i), + embedding_var); + } + + if (device == DEVICE::GPU) { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nums}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nums, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } else { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 12, 15}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {2, 5, 8, 10}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nnz, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } + } +}; + +#ifdef GOOGLE_CUDA +// TODO(junqi): Complete GPUEV related test + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatSumGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + 
EmbeddingVarLocalSparseLookUpGradFloatSqrtCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + EmbeddingVarLocalSparseLookUpGradFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + EmbeddingVarLocalSparseLookUpGradFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Cpu) { +// Run(DEVICE::CPU); +// } + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h new file mode 100644 index 00000000..8ced8a0c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h @@ -0,0 +1,371 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace { + +template +struct GroupEmbeddingBackWardArgs { + GroupEmbeddingBackWardArgs() = default; + GroupEmbeddingBackWardArgs(TValue *grads, TKey *sp_values, + TValue *emb_variable, TValue *grads_output, + int *offset_indices, int nnz) + : grads_(grads), + sp_values_(sp_values), + emb_variable_(emb_variable), + grads_output_(grads_output), + offset_indices_(offset_indices), + nnz_(nnz) {} + TValue *grads_; + TKey *sp_values_; + TValue *emb_variable_; + TValue *grads_output_; + int *offset_indices_; + int nnz_; +}; + +template +__global__ void ComputeEVGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + float l2_sum; + + const auto &block = cooperative_groups::this_thread_block(); + const auto &tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + + for (int j = 0; j < feature_num; ++j) { + float grad_i = grad; + int feature_offset = (value_offset + j) * dimension; + if (max_norm > 0.0f) { + 
float emb_element = 0.0f; // TODO(junqihu): get emb_weight + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + float l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + j) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void ComputeSparseGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + float l2_sum; + const auto &block = cooperative_groups::this_thread_block(); + const auto &tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + const int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + int64_t indices = int(args[idx].sp_values_[value_offset + i]); + float emb_element = + args[idx].emb_variable_[indices * dimension + tid]; + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + float l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + i) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void NormalComputeEVGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + __shared__ TValue l2_sum[1]; + + const auto &block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + + for (int j = 0; j < feature_num; ++j) { + float grad_i = grad; + int feature_offset = (value_offset + j) * dimension; + if (max_norm > 0.0f) { + float emb_element = 0.0f; // TODO(junqihu): get emb_weight + if (tid == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + j) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void NormalComputeSparseGradFn( + const int batch_size, const float max_norm, const int 
num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + __shared__ TValue l2_sum[1]; + + const auto &block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + for (int idx = 0; idx < num_lookups; ++idx) { + const int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + int64_t indices = int(args[idx].sp_values_[value_offset + i]); + float emb_element = + args[idx].emb_variable_[indices * dimension + tid]; + if (tid == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i; + } + } + } +} + +} // namespace + +template +class GroupEmbeddingLookupBackWard { + public: + explicit GroupEmbeddingLookupBackWard(int dimension, int num_lookups, + float max_norm, + Allocator *gpu_allocator = nullptr) + : alloc_(gpu_allocator) { + d_args_ = + TypedAllocator::Allocate>( + gpu_allocator, num_lookups, AllocationAttributes()); + h_args_.reserve(num_lookups); + max_norm_ = max_norm; + nums_ = num_lookups; + dimension_ = dimension; + } + + void set(GroupEmbeddingBackWardArgs &arg) { + h_args_.emplace_back(arg); + } + + ~GroupEmbeddingLookupBackWard() { + TypedAllocator::Deallocate(alloc_, d_args_, nums_); + } + + template + inline void Backward(GradFn fn, int batch_size, int tile_size, + cudaStream_t stream) { + CK_CUDA_THROW_(cudaMemcpyAsync( + d_args_, h_args_.data(), + h_args_.size() * sizeof(GroupEmbeddingBackWardArgs), + cudaMemcpyHostToDevice, stream)); + + { + if (tile_size <= 32) { + const int block_size = batch_size * tile_size / 64 + 1; + + fn<<>>(batch_size, max_norm_, nums_, + dimension_, d_args_); + } else { + fn<<>>(batch_size, max_norm_, nums_, + dimension_, d_args_); + } + } + + CK_CUDA_THROW_(cudaGetLastError()); + } + + protected: + std::vector> h_args_; + GroupEmbeddingBackWardArgs *d_args_; + Allocator *alloc_; + float max_norm_; + int nums_; + int dimension_; +}; + +template +class GroupLookupBackWardBaseOp : public OpKernel { + public: + explicit GroupLookupBackWardBaseOp(OpKernelConstruction *c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + } + + template + inline void compute(GroupEmbeddingLookupBackWard &lookuper, + const int batch_size, cudaStream_t stream) { + if (Isev) { + if (dimension_ <= 2) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 16, stream); + } else 
if (dimension_ <= 32) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 32, stream); + } else { + lookuper.Backward(NormalComputeEVGradFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 32, stream); + } else { + lookuper.Backward(NormalComputeSparseGradFn, + batch_size, dimension_, stream); + } + } + } + + protected: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc new file mode 100644 index 00000000..6ac80896 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc @@ -0,0 +1,264 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +template +class GroupEmbeddingVarLookupGradCpuOp : public OpKernel { + public: + explicit GroupEmbeddingVarLookupGradCpuOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + + for (int i = 0; i < num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + auto* grads = grads_tensor.flat().data(); + const Tensor unique_keys_tensor = ctx->input(2 * num_lookups_ + i); + auto* unique_keys = unique_keys_tensor.flat().data(); + int unique_nnz = unique_keys_tensor.NumElements(); + + const Tensor sp_indices_tensor = ctx->input(3 * num_lookups_ + i); + auto* sp_indices = sp_indices_tensor.flat().data(); + const Tensor batch_nums_tensor = ctx->input(4 * num_lookups_ + i); + auto* batch_nums = batch_nums_tensor.flat().data(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({unique_nnz, dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + + int slice_bytes = unique_nnz * dimension_ * 1000; + if (combiner_ == "mean") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + // Code Not Help + // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // int segment_id = sp_indices[i]; + // int scale = batch_nums[segment_id]; + // __m512 _weights = _mm512_set1_ps(scale); + // for (int d = 0; d < dimension_; d+=16) { + // int index = d / 16; + // int remain = dimension_ - d; + // __mmask16 mask = (remain >= 16 ? 0xffff : (1 << + // remain) - 1); + // __m512 _grads = _mm512_set1_ps(grads[segment_id * + // dimension_ + d]); + // __m512 _item = _mm512_div_ps(_grads, _weights); + // _mm512_mask_storeu_ps(grads_sp_values + i * + // dimension_ + d, mask, _item); + // } + // #else + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 
0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / scale; + } + // #endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else if (combiner_ == "sum") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + memcpy(grads_sp_values + i * dimension_, + grads + segment_id * dimension_, + sizeof(TValue) * dimension_); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, embedding_var_grad_combiner); + } else { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // int segment_id = sp_indices[i]; + // int scale = batch_nums[segment_id]; + // __m512 _weights = _mm512_set1_ps(sqrtf(scale)); + // for (int d = 0; d < dimension_; d += 16) { + // int index = d / 16; + // int remain = dimension_ - d; + // __mmask16 mask = (remain >= 16 ? 0xffff : (1 << + // remain) - 1); + // __m512 _grads = + // _mm512_set1_ps(grads[segment_id * dimension_ + + // d]); + // __m512 _item = _mm512_div_ps(_grads, _weights); + // _mm512_mask_storeu_ps(grads_sp_values + i * + // dimension_ + d, mask, + // _item); + // } + // #else + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / sqrtf(scale); + } + // #endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, embedding_var_grad_combiner); + } + } + } + + private: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVariableLookupGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVarLookupGradCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupGradCpuOp : public OpKernel { + public: + explicit GroupVariableLookupGradCpuOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + auto* grads = grads_tensor.flat().data(); + const Tensor emb_variables_tensor = ctx->input(num_lookups_ + i); + const Tensor unique_keys_tensor = ctx->input(2 * num_lookups_ + i); + auto* unique_keys = unique_keys_tensor.flat().data(); + int unique_nnz = unique_keys_tensor.NumElements(); + + const Tensor sp_indices_tensor = ctx->input(3 * num_lookups_ + i); + auto* 
sp_indices = sp_indices_tensor.flat().data(); + const Tensor batch_nums_tensor = ctx->input(4 * num_lookups_ + i); + auto* batch_nums = batch_nums_tensor.flat().data(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({unique_nnz, dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + TValue* grads_sp_values = grads_sp_values_tensor->flat().data(); + + int slice_bytes = unique_nnz * dimension_ * 1000; + if (combiner_ == "mean") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / scale; + } + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else if (combiner_ == "sum") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + memcpy(grads_sp_values + i * dimension_, + grads + segment_id * dimension_, + sizeof(TValue) * dimension_); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / sqrtf(scale); + } + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } + } + } + + private: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookupGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupGradCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc new file mode 100644 index 00000000..16d99562 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc @@ -0,0 +1,176 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "group_embedding_lookup_sparse_backward_base_ops.cu.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +template +class GroupVariableLookupBackwardOp + : public GroupLookupBackWardBaseOp { + public: + explicit GroupVariableLookupBackwardOp(OpKernelConstruction* c) + : GroupLookupBackWardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + int batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupBackWard lookuper( + this->dimension_, this->num_lookups_, this->max_norm_, gpu_allocator); + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + const Tensor emb_variables_tensor = ctx->input(this->num_lookups_ + i); + const Tensor sp_values_tensor = ctx->input(2 * this->num_lookups_ + i); + const Tensor sp_values_offset_tensor = + ctx->input(4 * this->num_lookups_ + i); + const int64_t nnz = sp_values_tensor.NumElements(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({nnz, this->dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + cudaMemsetAsync(grads_sp_values, 0, + sizeof(TValue) * nnz * this->dimension_, stream); + + if (i == 0) { + batch_size = sp_values_offset_tensor.shape().dim_size(0); + } + + GroupEmbeddingBackWardArgs args( + const_cast(grads_tensor.flat().data()), + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + const_cast(emb_variables_tensor.flat().data()), + grads_sp_values, + const_cast(sp_values_offset_tensor.flat().data()), nnz); + lookuper.set(args); + } + + if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupVariableLookupGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupBackwardOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float); +REGISTER_GPU_KERNELS(int32, int32_t, 
float); +#undef REGISTER_GPU_KERNELS + +template +class GroupEmbeddingVariableLookupBackwardOp + : public GroupLookupBackWardBaseOp { + public: + explicit GroupEmbeddingVariableLookupBackwardOp(OpKernelConstruction* c) + : GroupLookupBackWardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + int batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupBackWard lookuper( + this->dimension_, this->num_lookups_, this->max_norm_, gpu_allocator); + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, this->num_lookups_ + i), + &ev)); + core::ScopedUnref unref_me(ev); + const Tensor sp_values_tensor = ctx->input(2 * this->num_lookups_ + i); + const Tensor sp_values_offset_tensor = + ctx->input(4 * this->num_lookups_ + i); + // int dimension = ev->ValueLen(); + if (i == 0) { + batch_size = sp_values_offset_tensor.shape().dim_size(0); + } + + const int64_t nnz = sp_values_tensor.NumElements(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({nnz, this->dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + cudaMemsetAsync(grads_sp_values, 0, + sizeof(TValue) * nnz * this->dimension_, stream); + + GroupEmbeddingBackWardArgs args( + const_cast(grads_tensor.flat().data()), + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + nullptr /*fake*/, grads_sp_values, + const_cast(sp_values_offset_tensor.flat().data()), nnz); + lookuper.set(args); + } + + if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVariableLookupGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupBackwardOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float); +REGISTER_GPU_KERNELS(int32, int32_t, float); +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h new file mode 100644 index 00000000..3091535d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h @@ -0,0 +1,721 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#if GOOGLE_CUDA + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +namespace { + +template +struct GroupEmbeddingForWardArgs { + GroupEmbeddingForWardArgs() = default; + GroupEmbeddingForWardArgs(TValue* emb_variable, TValue* sp_weights, + TValue* emb_vector, TKey* sp_values, + int* offset_indices, int nnz) + : emb_variable_(emb_variable), + sp_weights_(sp_weights), + emb_vector_(emb_vector), + sp_values_(sp_values), + offset_indices_(offset_indices), + nnz_(nnz) {} + TValue* emb_variable_; + TValue* sp_weights_; + TValue* emb_vector_; + TKey* sp_values_; + int* offset_indices_; + int nnz_; +}; + +__global__ void SetToIntMaxSTG128(const int batch_size, int* values_offset) { + const int thread_offset = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int int_max = 0x7fffffff; + if (thread_offset + 4 < batch_size) { + ::int4 four = make_int4(int_max, int_max, int_max, int_max); + *((::int4*)(values_offset + thread_offset)) = four; + } else if (thread_offset < batch_size) { + for (int i = thread_offset; i < batch_size; i++) { + values_offset[i] = int_max; + } + } +} + +__device__ void FilledEmptyRowNumber(int batch_size, + volatile int* values_offset) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + const int int_max = 0x7fffffff; + if (thread_offset > 1) { + if (thread_offset < batch_size) { + while (values_offset[thread_offset] == int_max) { + const int compare = values_offset[thread_offset - 1]; + if (compare != int_max) { + atomicMin((int*)values_offset + thread_offset, compare); + } + } + } + } else { + if (values_offset[thread_offset] == int_max) { + values_offset[thread_offset] = 0; + } + } +} + +__global__ void CalcPerElementRowOffset(int batch_size, int nnz, int stride, + const int64_t* indices, + int* values_offset) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_offset < nnz) { + const int64_t element_row = indices[stride * thread_offset]; + atomicMin((int*)values_offset + int(element_row), thread_offset); + } + __syncthreads(); + FilledEmptyRowNumber(batch_size, values_offset); +} + +inline void launch_cal_per_element_row_offset(const int batch_size, int nnz, + int stride, + const int64_t* sp_indices, + int* offset_indices, + cudaStream_t stream) { + static int threads = 1024; + int blocks = (batch_size - 1) / threads + 1; + + SetToIntMaxSTG128<<>>(batch_size, offset_indices); + blocks = (nnz - 1) / threads + 1; + CalcPerElementRowOffset<<>>( + batch_size, nnz, stride, sp_indices, offset_indices); +} + +template +__global__ void WeightedEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + 
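+      // Annotation (not part of the original DeepRec kernel): a sketch of the
+      // cooperative-groups mapping used by this kernel and the sibling
+      // *ComputeFn kernels below, assuming the standard tiled_partition
+      // semantics. One tile of tile_sz threads owns one sample and each lane
+      // owns one element of the output embedding vector:
+      //   bid == blockIdx.x * (blockDim.x / tile_sz) + threadIdx.x / tile_sz
+      //   tid == threadIdx.x % tile_sz
+      // so each lane ends up writing
+      //   args[ev_id].emb_vector_[bid * dimension + tid].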
if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + + float out = 0.0f; + float total_batch_weight = 0.0f; + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + size_t feature_indices = value_offset + j; + int64_t embedding_offset = feature_indices * dimension; + TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum = 0.0; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, sum * sum); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out = __fmaf_rn(sum, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void WeightedVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + size_t feature_indices = value_offset + i; + int embedding_indices = int(args[ev_id].sp_values_[feature_indices]); + TValue sp_weights = args[ev_id].sp_weights_[embedding_indices]; + total_batch_weight += sp_weights; + TValue emb_element = emb_variable[feature_indices]; + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out = __fmaf_rn(emb_element, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void EmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0; + + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + int64_t feature_offset = (value_offset + j) * dimension; + TValue sum = args[ev_id].emb_variable_[feature_offset + tid]; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum = 0.0; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, sum * sum); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out += sum; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void VariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + int indices = int(args[ev_id].sp_values_[value_offset + i]); + TValue emb_element = emb_variable[indices * emb_vec_size + tid]; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void NormalEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0; + + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + int64_t feature_offset = (value_offset + j) * dimension; + TValue sum = args[ev_id].emb_variable_[feature_offset + tid]; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum[0] = 0.0; + } + block.sync(); + atomicAdd(l2_sum, sum * sum); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out += sum; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void NormalVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + int indices = int(args[ev_id].sp_values_[value_offset + i]); + TValue emb_element = emb_variable[indices * emb_vec_size + tid]; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum[0] = 0.0f; + } + block.sync(); + atomicAdd(l2_sum, emb_element * emb_element); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void NormalWeightedEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + size_t feature_indices = value_offset + j; + int64_t embedding_offset = feature_indices * dimension; + TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum[0] = 0.0; + } + block.sync(); + atomicAdd(l2_sum, sum * sum); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out = __fmaf_rn(sum, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void NormalWeightedVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + const TValue* emb_variable = args[ev_id].emb_variable_; + + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + size_t feature_indices = value_offset + i; + int embedding_indices = int(args[ev_id].sp_values_[feature_indices]); + TValue emb_element = + emb_variable[embedding_indices * emb_vec_size + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this 
emb row(per block) and compare with + // max_norm. + // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum[0] = 0.0f; + } + block.sync(); + atomicAdd(l2_sum, emb_element * emb_element); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out = __fmaf_rn(emb_element, sp_weights, out); + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +class GroupEmbeddingLookupForWard { + public: + explicit GroupEmbeddingLookupForWard(const int num_lookups, + const int dimension, + const float max_norm, + Allocator* gpu_allocator = nullptr) + : alloc_(gpu_allocator) { + max_norm_ = max_norm; + dimension_ = dimension; + ev_nums_ = num_lookups; + d_args_ = TypedAllocator::Allocate>( + gpu_allocator, num_lookups, AllocationAttributes()); + args_size_ = sizeof(GroupEmbeddingForWardArgs); + h_args_.reserve(ev_nums_); + } + + ~GroupEmbeddingLookupForWard() { + TypedAllocator::Deallocate(alloc_, d_args_, ev_nums_); + } + + void set(GroupEmbeddingForWardArgs& arg) { + h_args_.emplace_back(arg); + } + + template + inline void Lookup(ForwardFn compute_fn, const int batch_size, + const int tile_size, cudaStream_t stream) { + CK_CUDA_THROW_(cudaMemcpyAsync(d_args_, h_args_.data(), + ev_nums_ * args_size_, + cudaMemcpyHostToDevice, stream)); + + { + if (tile_size <= 32) { + const int block_size = batch_size * tile_size / 64 + 1; + compute_fn<<>>(batch_size, dimension_, + max_norm_, ev_nums_, d_args_); + } else { + compute_fn<<>>( + batch_size, dimension_, max_norm_, ev_nums_, d_args_); + } + } + + CK_CUDA_THROW_(cudaGetLastError()); + } + + protected: + std::vector> h_args_; + GroupEmbeddingForWardArgs* d_args_{nullptr}; + Allocator* alloc_; + float max_norm_{0.0f}; + int ev_nums_{0}; + int dimension_{0}; + size_t args_size_{0}; +}; + +template +class GroupEmbeddingLookupForwardBaseOp : public OpKernel { + public: + explicit GroupEmbeddingLookupForwardBaseOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("ignore_weights", &ignore_weights_)); + OP_REQUIRES_OK(c, c->GetAttr("is_sequence", &is_sequence_)); + } + + template + inline void compute(GroupEmbeddingLookupForWard& lookuper, + const int batch_size, cudaStream_t stream) { + if (isEv) { + if (ignore_weights_) { + if (dimension_ <= 2) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup(NormalEmbeddingVarComputeFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + 
batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup( + NormalWeightedEmbeddingVarComputeFn, + batch_size, dimension_, stream); + } + } + } else { + if (ignore_weights_) { + if (dimension_ <= 2) { + lookuper.Lookup(VariableComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(VariableComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(VariableComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(VariableComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(VariableComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup(NormalVariableComputeFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup( + NormalWeightedVariableComputeFn, + batch_size, dimension_, stream); + } + } + } + } + + protected: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; + bool ignore_weights_; + bool is_sequence_; +}; + +} // namespace + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h new file mode 100644 index 00000000..c1395e00 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h @@ -0,0 +1,64 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h", +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/training_op_helpers.h" + +namespace tensorflow { +// It's suggested that all CPU GroupEmbedding operations inherit from this base +// class. 
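+// A minimal usage sketch of this base class (illustrative only; the name
+// MyGroupLookupCpuOp is hypothetical, and the template parameters are assumed
+// to be the key/value types used by the derived ops in this patch):
+//
+//   template <typename TKey, typename TValue>
+//   class MyGroupLookupCpuOp : public GroupLookupBaseCpuOp<TKey, TValue> {
+//    public:
+//     explicit MyGroupLookupCpuOp(OpKernelConstruction* c)
+//         : GroupLookupBaseCpuOp<TKey, TValue>(c) {}
+//     void Compute(OpKernelContext* ctx) override {
+//       for (int i = 0; i < this->m_num_lookup; ++i) {
+//         // unique the i-th sparse id tensor, gather rows of width
+//         // this->m_dimension, then pool them according to this->m_combiner.
+//       }
+//     }
+//   };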
+template +class GroupLookupBaseCpuOp : public OpKernel { + public: + explicit GroupLookupBaseCpuOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &m_combiner)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &m_num_lookup)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &m_dimension)); + // OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("ignore_weights", &m_ignore_weights)); + OP_REQUIRES_OK(c, c->GetAttr("is_sequence", &m_is_sequence)); + OP_REQUIRES_OK(c, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, + kPartitionSize, &partition_size_)); + OP_REQUIRES( + c, partition_size_ > 0, + errors::InvalidArgument("Invalid PARTITION_SIZE=", partition_size_)); + OP_REQUIRES_OK(c, ReadBoolFromEnvVar(kUniqueOpSerialEnv, false, &serial_)); + OP_REQUIRES_OK( + c, ReadInt64FromEnvVar(kUniqueOpUniqRatioHint, kDefaultUniqueRatioHint, + &unique_ratio_hint_)); + OP_REQUIRES(c, unique_ratio_hint_ > 0, + errors::InvalidArgument("Invalid ", kUniqueOpUniqRatioHint, "=", + unique_ratio_hint_)); + } + + protected: + // float max_norm_; + int m_num_lookup; + int m_dimension; + bool m_is_use_default_value_tensor; + bool m_ignore_weights; + bool m_is_sequence; + std::string m_combiner; + bool serial_ = false; + int64 partition_size_ = 0; + int64 unique_ratio_hint_; + UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default + const int64 kDefaultUniqueRatioHint = 4; + const char* kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; + const char* kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; + const char* kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; +}; + +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc new file mode 100644 index 00000000..d4c61922 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc @@ -0,0 +1,690 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/util/work_sharder.h" +namespace tensorflow { + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; \ + using GroupLookupBaseCpuOp::m_is_sequence; + +using CPUDevice = Eigen::ThreadPoolDevice; + +template +class GroupEmbeddingVariableLookupCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + + public: + explicit GroupEmbeddingVariableLookupCpuOp(OpKernelConstruction *c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext *ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing unique value gather + step 3: assign unique embedding to batch result and pooling + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar *embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor &sp_values_tensor = ctx->input(m_num_lookup + i); + const Tensor &sp_indices_tensor = ctx->input(m_num_lookup * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_values_tensor.NumElements(); + const Tensor &dense_shape_tensor = ctx->input(m_num_lookup * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int64 batch_size = dense_shape[0]; + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis(ctx, sp_values_tensor, &unique_idx_tensor, + &unique_tensor, &unique_counter, 0, + this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + + auto *unique = unique_tensor.flat().data(); + auto *unique_idx = unique_idx_tensor.flat().data(); + + int unique_nnz = unique_tensor.shape().dim_size(0); + TensorShape unique_shape{static_cast(unique_nnz)}; + + TensorShape batch_nums_tensor_shape = + TensorShape(std::vector({batch_size})); + Tensor *batch_nums_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(3 * m_num_lookup + i, + batch_nums_tensor_shape, + &batch_nums_tensor)); + auto batch_nums = batch_nums_tensor->flat().data(); + memset(batch_nums, 0, batch_size * sizeof(int)); + for (int k = 0; k < nnz; ++k) { + int batch_id = sp_indices[k * dense_shape_tensor.NumElements()]; + batch_nums[batch_id] += 1; + } + for (int k = 1; k < 
batch_size; ++k) { + batch_nums[k] += batch_nums[k - 1]; + } + + // Stage 2 + Tensor unique_embedding; + unique_shape.AppendShape({static_cast(m_dimension)}); + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::v(), unique_shape, + &unique_embedding, attr)); + auto unique_embedding_data = unique_embedding.flat().data(); + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, unique, unique_embedding_data, unique_nnz, + reinterpret_cast( + ctx->input(m_num_lookup * 4 + 1).data())); + } else { + embedding_var->GetEmbeddings(ev_ctx, unique, unique_embedding_data, + unique_nnz); + embedding_var->UpdateCache(unique_tensor, unique_counter, + true /*called_by_gather*/); + } + + std::vector default_weights(nnz, 1.0); + TValue *sp_weights = default_weights.data(); + if (!this->m_ignore_weights) { + const Tensor &sp_weights_tensor = + ctx->input(this->m_num_lookup * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + // Stage 3 + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (m_is_sequence) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], m_dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, m_dimension})); + } + Tensor *gather_embedding_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + int slice_bytes = nnz / batch_size * m_dimension * 1000; + // todo: clean these redundant code + if (this->m_combiner == "mean") { + auto embedding_var_mean_combiner = [this, &gather_embedding, batch_nums, + unique_idx, unique, + unique_embedding_data, sp_weights]( + int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + batch_total_weights = + _mm512_add_ps(batch_total_weights, _weights); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], batch_total_weights); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(*(u_embedding + d), sp_weight, tmp_embedding[d]); + } + } + + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_mean_combiner); + } else if (this->m_combiner == "sum") { + auto embedding_var_sum_combiner = [this, &gather_embedding, batch_nums, + unique_idx, unique, + unique_embedding_data, + sp_weights](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(u_embedding[d], sp_weights[batch_offset + j], + tmp_embedding[d]); + } + } + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_sum_combiner); + } else { + auto embedding_var_sqrtn_combiner = [this, &gather_embedding, + batch_nums, unique_idx, unique, + unique_embedding_data, sp_weights]( + int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = + std::fma(local_weight, local_weight, batch_total_weights); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], _total_weights); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = + std::fma(sp_weight, sp_weight, batch_total_weights); + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(u_embedding[d], sp_weight, tmp_embedding[d]); + } + } + + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_sqrtn_combiner); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookup") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupVariableLookupCpuOp(OpKernelConstruction *c) + : GroupLookupBaseCpuOp(c) {} + + void Compute(OpKernelContext *ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + const Tensor &emb_variable_tensor = ctx->input(i); + const Tensor &sp_values_tensor = ctx->input(m_num_lookup + i); + int nnz = sp_values_tensor.NumElements(); + auto embedding_variable = emb_variable_tensor.flat().data(); + + const Tensor &sp_indices_tensor = ctx->input(m_num_lookup * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + + const Tensor &dense_shape_tensor = ctx->input(m_num_lookup * 4 + i); + auto dense_shape = 
dense_shape_tensor.flat().data(); + int64 batch_size = dense_shape[0]; + + TensorShape batch_nums_tensor_shape = + TensorShape(std::vector({batch_size})); + Tensor *batch_nums_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(3 * m_num_lookup + i, + batch_nums_tensor_shape, + &batch_nums_tensor)); + auto batch_nums = batch_nums_tensor->flat().data(); + memset(batch_nums, 0, batch_size * sizeof(int)); + for (int k = 0; k < nnz; ++k) { + int batch_id = sp_indices[k * dense_shape_tensor.NumElements()]; + batch_nums[batch_id] += 1; + } + for (int k = 1; k < batch_size; ++k) { + batch_nums[k] += batch_nums[k - 1]; + } + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (m_is_sequence) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], m_dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, m_dimension})); + } + + Tensor *emb_vectors_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + auto emb_vectors = emb_vectors_tensor->flat().data(); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis(ctx, sp_values_tensor, &unique_idx_tensor, + &unique_tensor, &unique_counter, 0, + this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + + auto *unique = unique_tensor.flat().data(); + auto *unique_idx = unique_idx_tensor.flat().data(); + + std::vector default_weights(nnz, 1.0); + TValue *sp_weights = default_weights.data(); + if (!this->m_ignore_weights) { + const Tensor &sp_weights_tensor = + ctx->input(this->m_num_lookup * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + int slice_bytes = nnz / batch_size * m_dimension * 1000; + if (this->m_combiner == "mean") { + auto do_var_mean = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + batch_total_weights = + _mm512_add_ps(batch_total_weights, _weights); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 
0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], batch_total_weights); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weight, tmp_embedding[d]); + } + } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_mean); + } else if (this->m_combiner == "sum") { + auto do_var_sum = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weights[batch_offset + j], tmp_embedding[d]); + } + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_sum); + } else { + auto do_var_sqrtn = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = + std::fma(local_weight, local_weight, batch_total_weights); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], _total_weights); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = + std::fma(sp_weight, sp_weight, batch_total_weights); + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weight, tmp_embedding[d]); + } + } + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_sqrtn); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookup") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc new file mode 100644 index 00000000..0295e91e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -0,0 +1,309 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "group_embedding_lookup_sparse_forward_base_ops.cu.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class GroupEmbeddingVarLookupOp + : public GroupEmbeddingLookupForwardBaseOp { + public: + explicit GroupEmbeddingVarLookupOp(OpKernelConstruction* c) + : GroupEmbeddingLookupForwardBaseOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* ctx) override { + const auto& device = ctx->eigen_device(); + TValue* default_v = nullptr; + int64 batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupForWard lookuper( + this->num_lookups_, this->dimension_, this->max_norm_, gpu_allocator); + + std::vector tensor_list; + tensor_list.reserve(this->num_lookups_); + + for (int i = 0; i < this->num_lookups_; ++i) { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, i), &ev)); + core::ScopedUnref unref_me(ev); + int64 dimension = ev->ValueLen(); + + const Tensor& sp_values_tensor = ctx->input(this->num_lookups_ + i); + auto sp_values = sp_values_tensor.flat(); + int64 N = sp_values_tensor.NumElements(); + + const Tensor& sp_indices_tensor = ctx->input(this->num_lookups_ * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_indices_tensor.shape().dim_size(0); + const Tensor& dense_shape_tensor = ctx->input(this->num_lookups_ * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int dense_shape_num = dense_shape_tensor.NumElements(); + batch_size = dense_shape[0]; + + TValue* default_v = nullptr; + if (is_use_default_value_tensor_) { + default_v = (TValue*)ctx->input(5 * this->num_lookups_).data(); + } else { + default_v = ev->GetDefaultValuePtr(); + } + + // DEBUG + const TFKey* key_base = sp_values.data(); + Tensor out_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + {N * dimension}, &out_tensor)); + TValue* out_base = out_tensor.flat().data(); + + EmbeddingVarContext ev_ctx(ctx); + if (ev->IsSingleHbm()) { + if (is_use_default_value_tensor_) { + Tensor default_values(ctx->input(5 * this->num_lookups_)); + auto default_value_num = default_values.NumElements() / dimension; + auto default_values_matrix = + default_values.shaped({default_value_num, dimension}); + TValue* default_v_base = &default_values_matrix(0, 0); + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } else { + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } + } else { + TensorShape indices_host_shape = sp_values_tensor.shape(); + Tensor indices_host(sp_indices_tensor.dtype(), indices_host_shape); + // Copy ids from GPU to CPU 
for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + se::DeviceMemoryBase gpu_src(const_cast(key_base), + N * sizeof(TFKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TFKey)); + SyncWithEventMgr(stream, event_mgr); + EmbeddingVarContext ev_ctx(ctx); + ev->GetEmbeddings(ev_ctx, (TFKey*)indices_host.data(), out_base, N); + ev->UpdateCache(indices_host, true); + } + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (this->is_sequence_) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, dimension})); + } + + Tensor* op_output_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &op_output_tensor)); + auto op_output = op_output_tensor->flat().data(); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + TensorShape(std::vector({batch_size})); + + // Fake Output + Tensor* unique_keys_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {this->num_lookups_ + i}, this->num_lookups_ + i, + sp_values_tensor.shape(), &unique_keys_tensor)); + + Tensor* unique_idx_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 2 + i, + values_offset_tensor_shape, + &unique_idx_tensor)); + + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 3 + i, + values_offset_tensor_shape, + &values_offset_tensor)); + auto values_offset = values_offset_tensor->flat().data(); + + launch_cal_per_element_row_offset( + batch_size, nnz, dense_shape_num, + reinterpret_cast(sp_indices), values_offset, + device.stream()); + + TValue* sp_weights = nullptr; + if (!this->ignore_weights_) { + const Tensor& sp_weights_tensor = + ctx->input(this->num_lookups_ * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + GroupEmbeddingForWardArgs group_embedding_args( + out_base, sp_weights, op_output, + const_cast(reinterpret_cast(key_base)), + values_offset, nnz); + + lookuper.set(group_embedding_args); + tensor_list.emplace_back(out_tensor); + } + + if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, device.stream()); + } else if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, device.stream()); + } else { + this->template compute(lookuper, batch_size, + device.stream()); + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookup") \ + .Device(DEVICE_GPU) \ + .HostMemory("dense_shape") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVarLookupOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float, float); +REGISTER_GPU_KERNELS(int32, int32_t, float, float); +#undef REGISTER_GPU_KERNELS + +template +class GroupVariableLookupOp + : public GroupEmbeddingLookupForwardBaseOp { + public: + explicit GroupVariableLookupOp(OpKernelConstruction* c) + : GroupEmbeddingLookupForwardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + const cudaStream_t stream = ctx->eigen_device().stream(); + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupForWard lookuper( + this->num_lookups_, 
this->dimension_, this->max_norm_, gpu_allocator); + int64 batch_size = -1; + + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor& emb_variable_tensor = ctx->input(i); + const Tensor& sp_values_tensor = ctx->input(this->num_lookups_ + i); + int64 emb_vec_size = emb_variable_tensor.shape().dim_size(1); + + const Tensor& sp_indices_tensor = ctx->input(this->num_lookups_ * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_indices_tensor.shape().dim_size(0); + const Tensor& dense_shape_tensor = ctx->input(this->num_lookups_ * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int dense_shape_num = dense_shape_tensor.NumElements(); + batch_size = dense_shape[0]; + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (this->is_sequence_) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], emb_vec_size})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, emb_vec_size})); + } + Tensor* emb_vectors_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + auto emb_vectors = emb_vectors_tensor->flat().data(); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + TensorShape(std::vector({batch_size})); + // Fake Output + Tensor* unique_keys_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {this->num_lookups_ + i}, this->num_lookups_ + i, + sp_values_tensor.shape(), &unique_keys_tensor)); + + Tensor* unique_idx_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 2 + i, + values_offset_tensor_shape, + &unique_idx_tensor)); + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 3 + i, + values_offset_tensor_shape, + &values_offset_tensor)); + auto values_offset = values_offset_tensor->flat().data(); + launch_cal_per_element_row_offset( + batch_size, nnz, dense_shape_num, + reinterpret_cast(sp_indices), values_offset, stream); + + TValue* sp_weights = nullptr; + if (!this->ignore_weights_) { + const Tensor& sp_weights_tensor = + ctx->input(this->num_lookups_ * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + GroupEmbeddingForWardArgs group_embedding_args( + const_cast(emb_variable_tensor.flat().data()), + sp_weights, emb_vectors, + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + values_offset, nnz); + lookuper.set(group_embedding_args); + } + + if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookup") \ + .Device(DEVICE_GPU) \ + .HostMemory("dense_shape") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float, float); +REGISTER_GPU_KERNELS(int32, int32_t, float, float); +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc new file mode 100644 index 00000000..d91852cb --- 
/dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc @@ -0,0 +1,493 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "incr_save_restore_ops.h" + +#include "tensorflow/core/framework/resource_handle.h" + +namespace tensorflow { + +template +class RecordSparseIndicesOp : public OpKernel { + public: + explicit RecordSparseIndicesOp(OpKernelConstruction* context) + : OpKernel(context), auto_record_(false) { + OP_REQUIRES_OK(context, + context->GetAttr("var_name", &sparse_incr_res_name_)); + OP_REQUIRES_OK(context, context->GetAttr("auto_record", &auto_record_)); + } + + void Compute(OpKernelContext* ctx) override { + IndicesIncrRecorder* sparse_incr_res = nullptr; + auto rm = ctx->resource_manager(); + OP_REQUIRES_OK( + ctx, rm->LookupOrCreate>( + "", sparse_incr_res_name_ + "_sparse_incr", &sparse_incr_res, + [this](IndicesIncrRecorder** ptr) { + *ptr = + new IndicesIncrRecorder(sparse_incr_res_name_); + if (auto_record_) { + (*ptr)->UpdateGlobalVersion(); + } + VLOG(2) << "sparse_incr_res created, name:" + << sparse_incr_res_name_; + return OkStatus(); + })); + sparse_incr_res->UpdateIndices(ctx->input(0), ctx); + } + + private: + string sparse_incr_res_name_; + bool auto_record_; +}; + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_CPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_CPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_GPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_GPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +class ActivateSparseRecorderOp : public OpKernel { + public: + explicit ActivateSparseRecorderOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_names = context->input(0); + const auto& tensor_names_flat = tensor_names.flat(); + const int num_tensors = static_cast(tensor_names.NumElements()); + + auto rm = context->resource_manager(); + for (int i = 0; i < num_tensors; ++i) { + const string& tensor_name = tensor_names_flat(i); + // cast forcely to IndicesIncrRecorder for incr cpkt + string incr_res_name = tensor_name + "_sparse_incr"; + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", incr_res_name, &sparse_incr_res); + if (sparse_incr_res != nullptr) { + sparse_incr_res->UpdateGlobalVersion(); + } else { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", incr_res_name, &sparse_incr_res); + if (sparse_incr_res != nullptr) { + sparse_incr_res->UpdateGlobalVersion(); + } else { + LOG(WARNING) << tensor_name << "_sparse_incr" + << " Resource NOT FOUND"; + } + } + } + } +}; + 
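+// RecordSparseIndices creates (or reuses) an IndicesIncrRecorder resource named
+// "<var_name>_sparse_incr" and feeds it the indices touched on each step, while
+// ActivateSparseRecorder flips the recorder's global version so recording
+// actually begins (UpdateIndices is a no-op until the recorder is activated,
+// either here or via the auto_record attribute). The paired Lookup calls above
+// presumably probe the recorder under each supported index type.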
+REGISTER_KERNEL_BUILDER(Name("ActivateSparseRecorder").Device(DEVICE_CPU), + ActivateSparseRecorderOp); + +class IncrSaveOp : public OpKernel { + public: + explicit IncrSaveOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + } + + void Compute(OpKernelContext* context) override { + const int kFixedInputs = 4; // Prefix, tensor names, is_sparse + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& is_sparse = context->input(3); + const int num_tensors = static_cast(tensor_names.NumElements()); + const auto& tensor_names_flat = tensor_names.flat(); + const auto& is_sparse_flat = is_sparse.flat(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + LOG(INFO) << "prefix_string: " << prefix_string + << "num tensors:" << num_tensors; + auto rm = context->resource_manager(); + BundleWriter writer(Env::Default(), prefix_string); + + for (int i = 0; i < num_tensors; i++) { + const string& tensor_name = tensor_names_flat(i); + if (is_sparse_flat(i)) { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", tensor_name + "_sparse_incr", &sparse_incr_res); + if (sparse_incr_res != nullptr) { + DumpIncrSparse(context, i, kFixedInputs, tensor_name, &writer, + sparse_incr_res); + } else { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", tensor_name + "_sparse_incr", &sparse_incr_res); + if (sparse_incr_res != nullptr) { + DumpIncrSparse(context, i, kFixedInputs, tensor_name, + &writer, sparse_incr_res); + } else { + LOG(WARNING) << tensor_name << "_sparse_incr" + << " Resource NOT FOUND"; + } + } + } else { + const Tensor& tensor = context->input(i + kFixedInputs); + + if (!shape_and_slices_flat(i).empty()) { + const string& shape_spec = shape_and_slices_flat(i); + TensorShape shape; + TensorSlice slice(tensor.dims()); + TensorShape slice_shape; + + OP_REQUIRES_OK(context, + checkpoint::ParseShapeAndSlice(shape_spec, &shape, + &slice, &slice_shape)); + OP_REQUIRES( + context, slice_shape.IsSameSize(tensor.shape()), + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", tensor.shape().DebugString())); + + OP_REQUIRES_OK(context, + writer.AddSlice(tensor_name, shape, slice, tensor)); + } else { + OP_REQUIRES_OK(context, writer.Add(tensor_name, tensor)); + } + } + } + OP_REQUIRES_OK(context, writer.Finish()); + } + + private: + template + void DumpIncrSparse(OpKernelContext* context, int i, const int& kFixedInputs, + const string& tensor_name, BundleWriter* writer, + IndicesIncrRecorder* sparse_incr_res) { + if (tensor_types_[i] == DT_RESOURCE) { + // ev, must be sparse + EmbeddingVar* variable = nullptr; + OP_REQUIRES_OK( + context, + LookupResource(context, HandleFromInput(context, i + kFixedInputs), + &variable)); + core::ScopedUnref unref_variable(variable); + OP_REQUIRES_OK(context, sparse_incr_res->DumpSparseEmbeddingTensor( + tensor_name, variable, writer, context)); + } else { + const Tensor& sparse_var = context->input(i + kFixedInputs); + OP_REQUIRES_OK(context, sparse_incr_res->DumpSparseNormalTensor( + tensor_name, sparse_var, writer)); + } + } + + private: + DataTypeVector tensor_types_; +}; + +REGISTER_KERNEL_BUILDER(Name("IncrSave").Device(DEVICE_CPU), IncrSaveOp); + +class IncrRestoreOp : public OpKernel { + 
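+  // Restores tensors written by IncrSave. For sparse inputs it opens a
+  // BundleReader on `prefix` and reads the "-sparse_incr_keys" /
+  // "-sparse_incr_values" / "-sparse_incr_versions" entries: the
+  // embedding-variable case emits them as three outputs, while the
+  // plain-variable case merges the restored rows into the full tensor passed
+  // as input 4. Non-sparse inputs simply fall back to RestoreTensorsV2.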
public: + explicit IncrRestoreOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& is_sparse_tensor = context->input(3); + const bool& is_sparse = is_sparse_tensor.scalar()(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + const int num_tensors = static_cast(tensor_names.NumElements()); + if (is_sparse) { + BundleReader reader(Env::Default(), prefix_string); + OP_REQUIRES_OK(context, reader.status()); + VLOG(1) << "BundleReader incr, prefix_string: " << prefix_string; + LOG(INFO) << "BundleReader incr, prefix_string: " << prefix_string; + const auto& tensor_names_flat = tensor_names.flat(); + if (num_tensors > 1) { + // EV + if (num_tensors != 3) { + OP_REQUIRES_OK( + context, + errors::InvalidArgument( + "Incr cpkt restore for ev must has 3 tensors, actually ", + num_tensors, " given")); + } + + const string& ev_keys_name = tensor_names_flat(0); + string incr_tensor_name = + ev_keys_name.substr(0, ev_keys_name.find("-keys")); + // 1 read keys, values and versions + TensorShape incr_shape; + Tensor* incr_keys_tensor = nullptr; + Tensor* incr_values_tensor = nullptr; + Tensor* incr_versions_tensor = nullptr; + OP_REQUIRES_OK( + context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_keys", &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output(0, incr_shape, + &incr_keys_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_keys", + incr_keys_tensor)); + OP_REQUIRES_OK(context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_values", + &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output(1, incr_shape, + &incr_values_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_values", + incr_values_tensor)); + + OP_REQUIRES_OK(context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_versions", + &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output( + 2, incr_shape, &incr_versions_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_versions", + incr_versions_tensor)); + } else { + // 1 Read keys from incr ckpt + TensorShape keys_shape; + Tensor keys_tensor; + DataType key_type; + + const string& tensor_name = tensor_names_flat(0); + OP_REQUIRES_OK(context, reader.LookupDtypeAndShape( + tensor_name + "-sparse_incr_keys", + &key_type, &keys_shape)); + + OP_REQUIRES_OK(context, context->allocate_temp(key_type, keys_shape, + &keys_tensor)); + + OP_REQUIRES_OK(context, reader.Lookup(tensor_name + "-sparse_incr_keys", + &keys_tensor)); + + LOG(INFO) << "Finished restoring incr normal sparse keys tensor:" + << tensor_name.data() + << ", size:" << keys_tensor.TotalBytes(); + + // 2 Read values from incr ckpt + TensorShape values_shape; + Tensor values_tensor; + + OP_REQUIRES_OK(context, + reader.LookupTensorShape( + tensor_name + "-sparse_incr_values", &values_shape)); + + OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, values_shape, + &values_tensor)); + + OP_REQUIRES_OK( + context, + reader.Lookup(tensor_name + "-sparse_incr_values", &values_tensor)); + + LOG(INFO) << "Finished restoring incr normal sparse values tensor:" + << tensor_name.data() + << ", size:" << 
values_tensor.TotalBytes(); + // 3 do incr update + const Tensor& orig_sparse_tensor = context->input(4); + Tensor* new_sparse_tensor = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {4}, 0, orig_sparse_tensor.shape(), + &new_sparse_tensor)); + + // 3.1 update specific rows + auto incr_values_flat = values_tensor.template matrix(); + auto new_values_flat = new_sparse_tensor->template matrix(); + auto limit = new_sparse_tensor->dim_size(1); + + for (auto i = 0; i < keys_tensor.NumElements(); i++) { + if (key_type == DT_INT32) { + auto incr_key = + keys_tensor.flat::Type>()(i); + if (incr_key >= new_sparse_tensor->dim_size(0)) continue; + for (auto j = 0; j < limit; j++) { + new_values_flat(incr_key, j) = incr_values_flat(i, j); + } + } else { + auto incr_key = + keys_tensor.flat::Type>()(i); + if (incr_key >= new_sparse_tensor->dim_size(0)) continue; + for (auto j = 0; j < limit; j++) { + new_values_flat(incr_key, j) = incr_values_flat(i, j); + } + } + } + LOG(INFO) << "Finished restoring normal sparse tensor(full+incr):" + << tensor_name.data() + << ", size:" << new_sparse_tensor->TotalBytes(); + } + } else { + RestoreTensorsV2(context, prefix, tensor_names, shape_and_slices, + tensor_types_); + } + } + + private: + DataTypeVector tensor_types_; +}; + +REGISTER_KERNEL_BUILDER(Name("IncrRestore").Device(DEVICE_CPU), IncrRestoreOp); + +class CollectSparseIndicesOp : public OpKernel { + public: + explicit CollectSparseIndicesOp(OpKernelConstruction* context) + : OpKernel(context), update_count_thd_(0) { + string config_str; + OP_REQUIRES_OK(context, context->GetAttr("config", &config_str)); + OP_REQUIRES_OK(context, ParseConfig(config_str)); + OP_REQUIRES_OK(context, context->GetAttr("ktype", &tensor_type_)); + OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); + + int64 part_idx, part_count, hash_bucket_size; + OP_REQUIRES_OK(context, context->GetAttr("part_idx", &part_idx)); + OP_REQUIRES_OK(context, context->GetAttr("part_count", &part_count)); + OP_REQUIRES_OK(context, + context->GetAttr("hash_bucket_size", &hash_bucket_size)); + + if (part_count > 0 && hash_bucket_size > 0) { + string part_mode_str; + OP_REQUIRES_OK(context, context->GetAttr("part_mode", &part_mode_str)); + if (part_mode_str == "mod") { + partitioner_ = std::move(std::unique_ptr( + new ModSparsePartitioner(part_count, part_idx, hash_bucket_size))); + } else { + partitioner_ = std::move(std::unique_ptr( + new DivSparsePartitioner(part_count, part_idx, hash_bucket_size))); + } + } + } + + void Compute(OpKernelContext* context) override { + if (tensor_type_ == DT_INT32) { + OP_REQUIRES_OK(context, + ExportSparseIndices(tensor_name_, context)); + } else if (tensor_type_ == DT_INT64) { + OP_REQUIRES_OK(context, + ExportSparseIndices(tensor_name_, context)); + } else { + LOG(WARNING) << "Not support key type:" << DataTypeString(tensor_type_); + } + } + + private: + template + Status ExportSparseIndices(const string& tensor_name, + OpKernelContext* context) { + auto rm = context->resource_manager(); + string resource_name = tensor_name + "_sparse_incr"; + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", resource_name, &sparse_incr_res); + if (sparse_incr_res == nullptr) { + LOG(WARNING) << tensor_name << " Resource NOT FOUND"; + return OkStatus(); + } + return DoExportSparseIndices(sparse_incr_res, context); + } + + template + Status DoExportSparseIndices(IndicesIncrRecorder* sparse_incr_res, + OpKernelContext* ctx) { + std::unordered_map indices; + 
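+    // SwapIndices() hands over the recorded key -> update-count map and resets
+    // the recorder; FilterIndices() keeps only keys seen at least
+    // update_count_thd_ times. With a partitioner configured, each local key
+    // is also mapped to a global id: "mod" partitioning computes
+    //   global = local * part_count + part_idx,
+    // and "div" partitioning adds the partition's starting offset,
+    //   global = part_offset_start + local.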
sparse_incr_res->SwapIndices(indices); + std::vector filtered_indices; + FilterIndices(indices, filtered_indices); + + Tensor* keys_out = nullptr; + Tensor* global_keys_out = nullptr; + TF_RETURN_IF_ERROR(ctx->allocate_output( + 0, TensorShape({(int64)filtered_indices.size()}), &keys_out)); + + TF_RETURN_IF_ERROR(ctx->allocate_output( + 1, TensorShape({(int64)filtered_indices.size()}), &global_keys_out)); + + auto keys_out_flat = keys_out->flat(); + auto global_keys_out_flat = global_keys_out->flat(); + for (size_t i = 0; i < filtered_indices.size(); i++) { + KeyType k = filtered_indices[i]; + KeyType global_k = k; + if (partitioner_) { + global_k = (KeyType)partitioner_->CalcGlobalOffset(k); + VLOG(2) << partitioner_->toString() << ", key:" << k + << ", global key:" << global_k; + } + keys_out_flat(i) = k; + global_keys_out_flat(i) = global_k; + } + return OkStatus(); + } + + template + void FilterIndices(const std::unordered_map& indices, + std::vector& filtered_indices) { + filtered_indices.reserve(indices.size()); + for (const auto& it : indices) { + const auto& key = it.first; + uint64 update_count = it.second; + if (update_count >= update_count_thd_) { + filtered_indices.push_back(key); + } + } + } + + Status ParseConfig(const string& config_str) { + LOG(INFO) << "Collect sparse indices config:" << config_str; + std::vector configs = str_util::Split(config_str, ","); + for (size_t i = 0; i < configs.size(); i++) { + const string& s = configs[i]; + std::vector kv = str_util::Split(s, "="); + if (kv.size() < 2) { + LOG(WARNING) << "invalid config:" << s; + continue; + } + if (kv[0] == "update_count_thd") { + if (!strings::safe_strtou64(kv[1], &update_count_thd_)) { + LOG(WARNING) << "invalid config:" << s; + } + } + } + + LOG(INFO) << "Parse collect sparse indices config success," + << "update_cound_thd=" << update_count_thd_; + + return OkStatus(); + } + + private: + std::string tensor_name_; + DataType tensor_type_; + uint64 update_count_thd_; + std::unique_ptr partitioner_; +}; + +REGISTER_KERNEL_BUILDER(Name("CollectSparseIndices").Device(DEVICE_CPU), + CollectSparseIndicesOp); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h new file mode 100644 index 00000000..71671cd8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h @@ -0,0 +1,553 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" +#include "tensorflow/core/util/tensor_slice_reader.h" + +namespace tensorflow { +template +class ThreadSafeHashMap { + public: + ThreadSafeHashMap() {} + ~ThreadSafeHashMap() {} + + public: + void Update(const Tensor& indices, int64 start, int64 end) { + mutex_lock l(lock_); + auto indices_flat = indices.flat(); + for (int64 idx = start; idx < end; idx++) { + auto indice = indices_flat(idx); + auto it = hash_map_.find(indice); + if (it == hash_map_.end()) { + hash_map_[indice] = 1; + } else { + it->second = it->second + 1; + } + } + } + + void Swap(std::unordered_map& out) { + mutex_lock l(lock_); + hash_map_.swap(out); + } + + void GetKeys(std::set& key_set) { + mutex_lock l(lock_); + for (auto it : hash_map_) { + key_set.insert(it.first); + } + } + + void Clear() { + mutex_lock l(lock_); + hash_map_.clear(); + } + + private: + std::unordered_map hash_map_; + mutex lock_; +}; + +template +class ParallelHashMap { + public: + explicit ParallelHashMap(int min_part_size = 128, int part_count = 32) + : part_count_(part_count), min_part_size_(min_part_size) { + hash_maps_.resize(part_count_); + } + + void Update(const Tensor& indices, OpKernelContext* ctx) { + const int64 N = indices.NumElements(); + auto thread_pool = *(ctx->device()->tensorflow_cpu_worker_threads()); + + std::vector> parts; + SplitParallelParts( + N, std::min(part_count_, thread_pool.workers->NumThreads()), parts); + + int part_count = parts.size(); + BlockingCounter counter(part_count); + for (int i = 0; i < part_count; i++) { + int64 start = parts[i].first; + int64 end = parts[i].second; + thread_pool.workers->Schedule([this, indices, i, start, end, &counter]() { + hash_maps_[i].Update(indices, start, end); + counter.DecrementCount(); + }); + } + counter.Wait(); + } + + void Swap(std::unordered_map& indices) { + std::vector> tmp_maps; + tmp_maps.resize(part_count_); + for (int i = 0; i < part_count_; i++) { + hash_maps_[i].Swap(tmp_maps[i]); + } + + indices.clear(); + for (int i = 0; i < part_count_; i++) { + for (auto it : tmp_maps[i]) { + auto indiceIt = indices.find(it.first); + if (indiceIt == indices.end()) { + indices[it.first] = it.second; + } else { + indices[it.first] += it.second; + } + } + } + } + + void Clear() { + for (size_t i = 0; i < part_count_; i++) { + 
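+      // Each ThreadSafeHashMap shard guards itself with its own mutex, so the
+      // shards can be cleared (or updated concurrently by Update()) without a
+      // global lock.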
hash_maps_[i].Clear(); + } + } + + void GetKeys(std::set& key_set) { + for (size_t i = 0; i < part_count_; i++) { + hash_maps_[i].GetKeys(key_set); + } + } + + void SplitParallelParts(int64 total_num, int64 part_count, + std::vector>& parts) { + if (total_num == 0) { + return; + } + + int64 actual_part_count = part_count; + int64 part_size = total_num / actual_part_count; + if (part_size < min_part_size_) { + actual_part_count = total_num / min_part_size_; + actual_part_count = actual_part_count == 0 ? 1 : actual_part_count; + } + + part_size = total_num / actual_part_count; + int64 left = total_num % actual_part_count; + int64 start = 0; + for (int i = 0; i < actual_part_count; i++) { + int64 end = start + part_size + (left > 0 ? 1 : 0); + parts.push_back(std::make_pair(start, end)); + start = end; + left -= 1; + } + } + + private: + std::vector> hash_maps_; + int part_count_; + int min_part_size_; +}; + +template +class IncrKeyDumpIterator : public DumpIterator { + public: + explicit IncrKeyDumpIterator(std::vector& incr_keys) + : incr_keys_(incr_keys) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + K Next() { return *keys_iter_++; } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; +}; + +template +class IncrEVValueDumpIterator : public DumpIterator { + public: + IncrEVValueDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + keys_idx_ = 1; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_iter_ != incr_keys_.end()) { + if (keys_idx_ < incr_keys_.size()) { + return true; + } else { + return col_idx_ < emb_var_->ValueLen(); + } + } else { + return false; + } + } + + T Next() { + if (col_idx_ >= emb_var_->ValueLen()) { + keys_iter_++; + keys_idx_++; + col_idx_ = 0; + } + void* value_ptr = NULL; + TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); + return emb_var_->flat(value_ptr)(col_idx_++); + } + + private: + int64 keys_idx_; + int64 col_idx_; + typename std::vector::iterator keys_iter_; + std::vector& incr_keys_; + EmbeddingVar* emb_var_; +}; + +template +class IncrEVVersionDumpIterator : public DumpIterator { + public: + IncrEVVersionDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + T Next() { + if (emb_var_->StepsToLive() == 0) { + keys_iter_++; + return 0; + } else { + K key = *keys_iter_; + int64 dump_version = emb_var_->GetVersion(key); + keys_iter_++; + return dump_version; + } + } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; + EmbeddingVar* emb_var_; +}; + +template +class IncrEVFreqDumpIterator : public DumpIterator { + public: + IncrEVFreqDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + T Next() { + K key = *keys_iter_; + int64 dump_version = emb_var_->GetFreq(key); + keys_iter_++; + return dump_version; + } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; + EmbeddingVar* emb_var_; +}; + +template +class IncrNormalValueDumpIterator : public DumpIterator { + public: + IncrNormalValueDumpIterator(std::vector& incr_keys, const Tensor& variable) + : 
incr_keys_(incr_keys), variable_(variable) { + var_data_ = (T*)variable.flat().data(); + keys_iter_ = incr_keys_.begin(); + keys_idx_ = 1; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_iter_ != incr_keys_.end()) { + if (keys_idx_ < incr_keys_.size()) { + return true; + } else { + return col_idx_ < variable_.dim_size(1); + } + } else { + return false; + } + } + + T Next() { + if (col_idx_ >= variable_.dim_size(1)) { + keys_iter_++; + keys_idx_++; + col_idx_ = 0; + } + T val = var_data_[(*keys_iter_) * variable_.dim_size(1) + col_idx_]; + col_idx_++; + return val; + } + + private: + std::vector& incr_keys_; + T* var_data_; + int64 col_limit_; + int64 keys_idx_; + typename std::vector::iterator keys_iter_; + int64 col_idx_; + const Tensor& variable_; +}; + +template +class IndicesIncrRecorder : public ResourceBase { + public: + explicit IndicesIncrRecorder(const std::string& name, int32 part_count = 16, + int32 min_part_size = 128) + : name_(name), incr_indices_(min_part_size, part_count) {} + + void UpdateIndices(const Tensor& indices, OpKernelContext* ctx) { + if (global_version_ == -1) { + return; + } + + incr_indices_.Update(indices, ctx); + } + + void UpdateGlobalVersion() { + global_version_ = 1; + mutex_lock l(mu_); + incr_indices_.Clear(); + } + + void SwapIndices(std::unordered_map& indices) { + incr_indices_.Swap(indices); + } + + Status DumpSparseNormalTensor(const string& tensor_name, + const Tensor& variable, BundleWriter* writer) { + mutex_lock l(mu_); + size_t bytes_limit = 8 << 20; + char* dump_buffer = (char*)malloc(sizeof(char) * bytes_limit); + + std::set incr_keys_set; + incr_indices_.GetKeys(incr_keys_set); + std::vector incr_keys; + incr_keys.assign(incr_keys_set.begin(), incr_keys_set.end()); + + IncrKeyDumpIterator key_dump_iter(incr_keys); + Status st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_keys", writer, dump_buffer, bytes_limit, + &key_dump_iter, TensorShape({incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrNormalValueDumpIterator value_dump_iter(incr_keys, variable); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_values", writer, dump_buffer, bytes_limit, + &value_dump_iter, + TensorShape({incr_keys.size(), variable.dim_size(1)})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + free(dump_buffer); + return OkStatus(); + } + + Status DumpSparseEmbeddingTensor(const string& tensor_name, + EmbeddingVar* emb_var, + BundleWriter* writer, + OpKernelContext* context) { + mutex_lock l(mu_); + size_t bytes_limit = 8 << 20; + char* dump_buffer = (char*)malloc(sizeof(char) * bytes_limit); + + std::set incr_keys; + incr_indices_.GetKeys(incr_keys); + + std::vector> incr_keys_parts; + incr_keys_parts.resize(kSavedPartitionNum); + + for (auto& ik : incr_keys) { + for (int partid = 0; partid < kSavedPartitionNum; partid++) { + if (ik % kSavedPartitionNum == partid && + emb_var->GetFreq(ik) >= emb_var->MinFreq()) { + incr_keys_parts[partid].push_back(ik); + break; + } + } + } + + std::vector partitioned_incr_keys; + Tensor part_offset_tensor; + context->allocate_temp(DT_INT32, TensorShape({kSavedPartitionNum + 1}), + &part_offset_tensor); + auto part_offset_flat = part_offset_tensor.flat(); + part_offset_flat(0) = 0; + int ptsize = 0; + for (int partid = 0; partid < kSavedPartitionNum; partid++) { + std::vector& key_list = incr_keys_parts[partid]; + + ptsize += key_list.size(); + for (int inpid = 0; inpid < key_list.size(); inpid++) { + partitioned_incr_keys.push_back(key_list[inpid]); 
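+        // Keys are appended partition by partition (partition id = key %
+        // kSavedPartitionNum; low-frequency keys were filtered out above), and
+        // the cumulative per-partition counts recorded below are saved as
+        // "<tensor_name>-incr_partition_offset" next to the incremental keys.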
+ } + + part_offset_flat(partid + 1) = part_offset_flat(partid) + key_list.size(); + } + writer->Add(tensor_name + "-incr_partition_offset", part_offset_tensor); + + IncrKeyDumpIterator key_dump_iter(partitioned_incr_keys); + Status st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_keys", writer, dump_buffer, bytes_limit, + &key_dump_iter, TensorShape({partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrEVValueDumpIterator ev_value_dump_iter(partitioned_incr_keys, + emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_values", writer, dump_buffer, bytes_limit, + &ev_value_dump_iter, + TensorShape( + {(uint64)partitioned_incr_keys.size(), emb_var->ValueLen()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrEVVersionDumpIterator ev_version_dump_iter( + partitioned_incr_keys, emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_versions", writer, dump_buffer, bytes_limit, + &ev_version_dump_iter, + TensorShape({(uint64)partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + IncrEVFreqDumpIterator ev_freq_dump_iter(partitioned_incr_keys, + emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_freqs", writer, dump_buffer, bytes_limit, + &ev_freq_dump_iter, + TensorShape({(uint64)partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + free(dump_buffer); + return OkStatus(); + } + + string DebugString() const { return "IndicesIncrRecorder"; } + + string GetName() { return name_; } + + private: + mutex mu_; + string name_; + ParallelHashMap incr_indices_; + std::atomic global_version_ = {-1}; + + TF_DISALLOW_COPY_AND_ASSIGN(IndicesIncrRecorder); +}; + +class SparsePartitioner { + public: + SparsePartitioner(int64 part_count, int64_t part_idx, int64 hash_bucket_size) + : part_count_(part_count), + part_idx_(part_idx), + hash_bucket_size_(hash_bucket_size) { + assert(part_idx_ >= part_count_); + } + + virtual int64 CalcGlobalOffset(int64 part_offset) = 0; + + std::string toString() const { + return strings::Printf( + "part_mode:%s, part_count:%lld, part_idx:%ld, hash_bucket_size:%ld", + part_mode_.c_str(), part_count_, part_idx_, (long)hash_bucket_size_); + } + + protected: + std::string part_mode_; + int64 part_count_; + int64 part_idx_; + int64 hash_bucket_size_; +}; + +class DivSparsePartitioner : public SparsePartitioner { + public: + DivSparsePartitioner(int64 part_count, int64 part_idx, int64 hash_bucket_size) + : SparsePartitioner(part_count, part_idx, hash_bucket_size) { + part_mode_ = "div"; + int64 ids_per_part = hash_bucket_size_ / part_count_; + int64 extras = hash_bucket_size_ % part_count_; + + part_offset_start_ = 0; + for (int i = 0; i < part_idx; i++) { + part_offset_start_ += (i < extras ? 
(ids_per_part + 1) : ids_per_part); + } + } + + int64 CalcGlobalOffset(int64 part_offset) { + return part_offset_start_ + part_offset; + } + + private: + int64 part_offset_start_; +}; + +class ModSparsePartitioner : public SparsePartitioner { + public: + ModSparsePartitioner(int64 part_count, int64 part_idx, int64 hash_bucket_size) + : SparsePartitioner(part_count, part_idx, hash_bucket_size) { + part_mode_ = "mod"; + } + + int64 CalcGlobalOffset(int64 part_offset) { + return part_offset * part_count_ + part_idx_; + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc new file mode 100644 index 00000000..fc175dbf --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc @@ -0,0 +1,256 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "incr_save_restore_ops.h" + +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "deepray/custom_ops/utils/fake_input.h" +#include "deepray/custom_ops/utils/ops_testutil.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace { + +void doTestSplitParallelParts( + int part_count, int min_part_size, int total_num, + std::vector> expect_parts) { + ParallelHashMap parallel_hashmap(min_part_size, part_count); + + std::vector> parts; + parallel_hashmap.SplitParallelParts(total_num, part_count, parts); + + ASSERT_EQ(expect_parts.size(), parts.size()); + for (size_t i = 0; i < parts.size(); i++) { + EXPECT_EQ(expect_parts[i].first, parts[i].first); + EXPECT_EQ(expect_parts[i].second, parts[i].second); + } +} + +TEST(ParallelHashMapTest, TestSplitParallelParts) { + 
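+  // Expected splits follow SplitParallelParts with part_count = 4 and
+  // min_part_size = 3: a total of 8 is too small for four parts (8 / 3 = 2
+  // parts of size 4), while 13 keeps four parts of size 13 / 4 = 3 and the
+  // single leftover element goes to the first part: {0,4},{4,7},{7,10},{10,13}.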
doTestSplitParallelParts(4, 3, 0, {}); + doTestSplitParallelParts(4, 3, 1, {{0, 1}}); + doTestSplitParallelParts(4, 3, 8, {{0, 4}, {4, 8}}); + doTestSplitParallelParts(4, 3, 12, {{0, 3}, {3, 6}, {6, 9}, {9, 12}}); + doTestSplitParallelParts(4, 3, 13, {{0, 4}, {4, 7}, {7, 10}, {10, 13}}); + doTestSplitParallelParts(4, 3, 15, {{0, 4}, {4, 8}, {8, 12}, {12, 15}}); + doTestSplitParallelParts(4, 3, 16, {{0, 4}, {4, 8}, {8, 12}, {12, 16}}); + doTestSplitParallelParts(4, 3, 17, {{0, 5}, {5, 9}, {9, 13}, {13, 17}}); +} + +TEST(ParallelHashMapTest, TestUpdateAndSwap) { + ParallelHashMap parallel_hashmap(2); + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + parallel_hashmap.Update(t, context.get()); + + std::unordered_map out_indices; + parallel_hashmap.Swap(out_indices); + EXPECT_EQ(3, out_indices.size()); + EXPECT_EQ(1, out_indices[1]); + EXPECT_EQ(2, out_indices[2]); + EXPECT_EQ(2, out_indices[3]); +} + +TEST(ParallelHashMapTest, TestGetKeys) { + ParallelHashMap parallel_hashmap(2); + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + parallel_hashmap.Update(t, context.get()); + + std::set keys; + parallel_hashmap.GetKeys(keys); + EXPECT_EQ(3, keys.size()); + EXPECT_TRUE(keys.find(1) != keys.end()); + EXPECT_TRUE(keys.find(2) != keys.end()); + EXPECT_TRUE(keys.find(3) != keys.end()); +} + +TEST(IndicesIncrRecorderTest, TestUpdateAndSwap) { + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + IndicesIncrRecorder recorder("test", 16, 2); + recorder.UpdateGlobalVersion(); + recorder.UpdateIndices(t, context.get()); + + std::unordered_map out_indices; + recorder.SwapIndices(out_indices); + EXPECT_EQ(3, out_indices.size()); + EXPECT_EQ(1, out_indices[1]); + EXPECT_EQ(2, out_indices[2]); + EXPECT_EQ(2, out_indices[3]); +} + +TEST(DivSparsePartitionerTest, TestCalcGlobalOffset) { + // part_count: 4, hash_bucket_size: 15 + // [0, 4), [4, 8), [8, 12), [12, 15) + + { + DivSparsePartitioner p(4, 0, 15); + EXPECT_EQ(0, p.CalcGlobalOffset(0)); + EXPECT_EQ(1, p.CalcGlobalOffset(1)); + EXPECT_EQ(2, p.CalcGlobalOffset(2)); + EXPECT_EQ(3, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 1, 15); + EXPECT_EQ(4, p.CalcGlobalOffset(0)); + EXPECT_EQ(5, p.CalcGlobalOffset(1)); + EXPECT_EQ(6, p.CalcGlobalOffset(2)); + EXPECT_EQ(7, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 2, 15); + EXPECT_EQ(8, p.CalcGlobalOffset(0)); + EXPECT_EQ(9, p.CalcGlobalOffset(1)); + EXPECT_EQ(10, p.CalcGlobalOffset(2)); + EXPECT_EQ(11, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 3, 15); + EXPECT_EQ(12, p.CalcGlobalOffset(0)); + EXPECT_EQ(13, p.CalcGlobalOffset(1)); + EXPECT_EQ(14, p.CalcGlobalOffset(2)); + } +} + +class 
CollectOpTest : public OpsTestBase { + protected: + void MakeOp(const string &config_str, const string &tensor_name, + DataType ktype, const string &part_mode = "div", + int64 part_idx = 0, int64 part_count = 0, + int64 hash_bucket_size = 0) { + TF_EXPECT_OK(NodeDefBuilder("collect_op", "CollectSparseIndices") + .Attr("tensor_name", tensor_name) + .Attr("config", config_str) + .Attr("part_idx", part_idx) + .Attr("part_count", part_count) + .Attr("hash_bucket_size", hash_bucket_size) + .Attr("part_mode", part_mode) + .Attr("ktype", ktype) + .Finalize(node_def())); + + TF_EXPECT_OK(InitOp()); + } + + template + void CheckCollect() { + string tensor_name = "test_tensor_name"; + DataType key_type = DataTypeToEnum::v(); + MakeOp("", tensor_name, key_type); + + // prepare context to run the op + context_.reset(nullptr); + + params_.reset(new OpKernelContext::Params); + params_.get()->device = device_; + params_.get()->frame_iter = FrameAndIter(0, 0); + params_.get()->inputs = &inputs_; + params_.get()->op_kernel = kernel_.get(); + step_container_.reset(new ScopedStepContainer(0, [](const string &) {})); + params_->step_container = step_container_.get(); + std::vector attrs; + test::SetOutputAttrs(params_.get(), &attrs); + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params_.get()->slice_reader_cache = &slice_reader_cache_wrapper; + params_.get()->resource_manager = device_->resource_manager(); + + context_.reset(new OpKernelContext(params_.get())); + + IndicesIncrRecorder *sparse_incr_res = nullptr; + auto rm = device_->resource_manager(); + + Status s = rm->LookupOrCreate>( + "", tensor_name + "_sparse_incr", &sparse_incr_res, + [this, tensor_name](IndicesIncrRecorder **ptr) { + *ptr = new IndicesIncrRecorder(tensor_name); + (*ptr)->UpdateGlobalVersion(); + return OkStatus(); + }); + ASSERT_TRUE(s.ok()); + + Tensor indices(allocator(), key_type, TensorShape({5})); + test::FillValues( + &indices, {(KeyType)1, (KeyType)2, (KeyType)3, (KeyType)4, (KeyType)5}); + sparse_incr_res->UpdateIndices(indices, context_.get()); + + device_->Compute(kernel_.get(), context_.get()); + + Tensor output_keys = *GetOutput(0); + Tensor output_global_keys = *GetOutput(1); + EXPECT_EQ(5, output_keys.NumElements()); + EXPECT_EQ(5, output_global_keys.NumElements()); + test::ExpectTensorEqual(output_keys, output_global_keys); + } +}; + +#define TEST_COLLECT(kt) \ + TEST_F(CollectOpTest, TestCollect##_##kt) { CheckCollect(); } + +TEST_COLLECT(int64); +TEST_COLLECT(int32); + +} // namespace +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc b/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc new file mode 100644 index 00000000..66aa6d68 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include "hotness_calculate.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" + +namespace stream_executor { +namespace gpu { +cudaStream_t AsGpuStreamValue(Stream* stream); +} // namespace gpu +} // namespace stream_executor + +namespace tensorflow { + +// ----------------------------------------------------------------------------------------------- +// HotnessCalculate +// ----------------------------------------------------------------------------------------------- +template +class HotnessCalculateOp : public OpKernel { + public: + explicit HotnessCalculateOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + launcher_.initialize(); + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_gpus", &num_gpus_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* row_length_send_buffer = nullptr; + OP_REQUIRES_OK(ctx, + ctx->input("row_length_buffer", &row_length_send_buffer)); + int64_t input_len = row_length_send_buffer->dim_size(0); + OP_REQUIRES( + ctx, input_len % (num_lookups_ * num_gpus_) == 0, + errors::InvalidArgument("input_len%(num_lookups_*num_gpus_) != 0")); + size_t local_batchsize = input_len / num_lookups_ / num_gpus_; + Tensor* hotness = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {num_lookups_}, &hotness)); + + // temp buffer + Tensor device_buffer; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT32, {num_lookups_}, &device_buffer)); + + // stream + auto device_ctx = ctx->op_device_context(); + OP_REQUIRES(ctx, device_ctx != nullptr, + errors::Aborted("No valid device context.")); + cudaStream_t stream = + stream_executor::gpu::AsGpuStreamValue(device_ctx->stream()); + + // cuda kernel + launcher_(row_length_send_buffer->data(), local_batchsize, num_lookups_, + num_gpus_, device_buffer.data(), hotness->data(), stream); + } + + private: + sok::HotnessCalLauncher launcher_; + int num_lookups_; + int num_gpus_; +}; + +#define REGISTER_GPU_KERNELS(dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER(Name("HotnessCalculate") \ + .Device(DEVICE_GPU) \ + .HostMemory("hotness") \ + .TypeConstraint("Tindices"), \ + HotnessCalculateOp) + +REGISTER_GPU_KERNELS(int64_t, int64_t); +REGISTER_GPU_KERNELS(int32_t, int32_t); + +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc new file mode 100644 index 00000000..0222bdcf --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hotness_calculate.h" + +#include "deepray/custom_ops/utils/check.h" + +namespace sok { + +template +__global__ void hotnessCalKernel(const DType *row_length_recv_buffer, + size_t local_batchsize, int num_lookup, + int num_gpus, int *outputs) { + size_t thread_cnt = blockDim.x * gridDim.x; + size_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t items = local_batchsize * num_lookup * num_gpus; + extern __shared__ int smem[]; + for (size_t i = threadIdx.x; i < num_lookup; i += blockDim.x) { + smem[i] = 0; + } + + __syncthreads(); + for (size_t i = thread_idx; i < items; i += thread_cnt) { + size_t num_lookup_id = (i / local_batchsize) % num_lookup; + int value = (int)(row_length_recv_buffer[i]); + atomicMax(smem + num_lookup_id, value); + } + + __syncthreads(); + for (size_t i = threadIdx.x; i < num_lookup; i += blockDim.x) { + atomicMax(outputs + i, smem[i]); + } +} + +template +void HotnessCalLauncher::initialize() { + int device; + CUDACHECK(cudaGetDevice(&device)); + CUDACHECK(cudaDeviceGetAttribute(&sm_count_, cudaDevAttrMultiProcessorCount, + device)); +} + +template +void HotnessCalLauncher::operator()( + const void *row_length_recv_buffer, size_t local_batchsize, int num_lookup, + int num_gpus, void *output_device, void *output_host, cudaStream_t stream) { + const DType *t_row_length_recv_buffer = + reinterpret_cast(row_length_recv_buffer); + int32_t *t_output_device = reinterpret_cast(output_device); + int32_t *t_output_host = reinterpret_cast(output_host); + + dim3 grid_dim(2 * sm_count_); + dim3 block_dim(1024ul); + CUDACHECK(cudaMemsetAsync(t_output_device, 0, sizeof(int32_t) * num_lookup, + stream)); + hotnessCalKernel + <<>>( + t_row_length_recv_buffer, local_batchsize, num_lookup, num_gpus, + t_output_device); + CUDACHECK(cudaMemcpyAsync(t_output_host, t_output_device, + sizeof(int32_t) * num_lookup, + cudaMemcpyDeviceToHost, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + + // CUDACHECK(cudaGetLastError()); +} + +template class HotnessCalLauncher; +template class HotnessCalLauncher; + +} // namespace sok diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h new file mode 100644 index 00000000..8b1d473a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef HOTNESS_KERNEL_H
+#define HOTNESS_KERNEL_H
+
+#include
+#include
+
+#include
+
+namespace sok {
+
+template <typename DType>
+class HotnessCalLauncher {
+ public:
+  void initialize();
+  void operator()(const void* row_length_recv_buffer, size_t local_batchsize,
+                  int num_lookup, int num_gpus, void* output_device,
+                  void* output_host, cudaStream_t stream = 0);
+
+ private:
+  int sm_count_;
+};
+
+}  // namespace sok
+
+#endif
diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc
new file mode 100644
index 00000000..349b941b
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc
@@ -0,0 +1,593 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h"
+#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h"
+#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h"
+#include "deepray/custom_ops/embedding_variable/config.pb.h"
+#include "kv_variable_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
+#include "tensorflow/core/kernels/gather_functor.h"
+#include "tensorflow/core/kernels/scatter_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/util.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/gpu_device_array.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+#if GOOGLE_CUDA
+using se::DeviceMemoryBase;
+using se::Stream;
+#endif  // GOOGLE_CUDA
+
+template <typename TKey, typename TValue>
+class KvResourceLookupResourceOp : public OpKernel {
+ public:
+  explicit KvResourceLookupResourceOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    EmbeddingVar<TKey, TValue>* ev = nullptr;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev));
+    Tensor* output;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {1}, &output));
+    auto output_scalar = output->scalar<int64>();
+    output_scalar() = (int64)ev;
+  }
+};
+
+#define REGISTER_KV_LOOKUP_RESOURCE(dev, ktype,
vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupResource") \ + .Device(DEVICE_##dev) \ + .HostMemory("output") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupResourceOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KV_LOOKUP_RESOURCE(dev, int32, type) \ + REGISTER_KV_LOOKUP_RESOURCE(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KV_LOOKUP_RESOURCE + +template +class KvResourceLookupIDOp : public OpKernel { + public: + explicit KvResourceLookupIDOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->flat(); + int64* out_base = &out_flat(0); + + auto indices_flat = indices.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + EmbeddingVarContext ev_ctx(c); + ev->GetOrCreateKey(ev_ctx, indices, reinterpret_cast(out_base), + indices_size); + } + } +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceLookupIDOp) +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ + .Device(DEVICE_##dev) \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceLookupIDOp) +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceCollectEmbeddingOp : public OpKernel { + public: + explicit KvResourceCollectEmbeddingOp(OpKernelConstruction* c) + : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const Tensor& pointer = c->input(2); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* 
out_base = &out_flat(0, 0); + + auto indices_flat = indices.flat(); + auto pointer_flat = pointer.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + const int64 slice_elems = out_flat.dimension(1); + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + const size_t slice_bytes = slice_elems * sizeof(TValue); + EmbeddingVarContext ev_ctx(c); + ev->GatherEmbeddings(ev_ctx, indices, (void**)pointer.data(), out_base, + N); + } + } +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceCollectEmbedding") \ + .Device(DEVICE_##dev) \ + .HostMemory("resource") \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .HostMemory("default_value") \ + .HostMemory("output") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceCollectEmbeddingOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceCollectEmbedding") \ + .Device(DEVICE_##dev) \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .HostMemory("default_value") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceCollectEmbeddingOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceGatherOp : public OpKernel { + public: + explicit KvResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* out_base = &out_flat(0, 0); + + const int64 slice_elems = out_flat.dimension(1); + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + + EmbeddingVarContext ev_ctx(c); + if 
(is_use_default_value_tensor_) { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N, + reinterpret_cast(c->input(2).data())); + } else { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices, indices_counts, true); + } else { + ev->UpdateCache(indices, true); + } + } + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceGather") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherOp) + +#define REGISTER_KERNELS_ALL_INDICES(type) \ + REGISTER_KERNELS(CPU, int32, type); \ + REGISTER_KERNELS(CPU, int64, type) + +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDICES) +#undef REGISTER_KERNELS_ALL_INDICES +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceGatherV1") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvResourceGatherGPUOp : public OpKernel { + public: + explicit KvResourceGatherGPUOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* out_base = &out_flat(0, 0); + + auto indices_flat = indices.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + const int64 slice_elems = out_flat.dimension(1); + TValue* default_v = nullptr; + if (is_use_default_value_tensor_) { + default_v = (TValue*)c->input(2).data(); + } else { + default_v = ev->GetDefaultValuePtr(); + } + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + const size_t slice_bytes = slice_elems * sizeof(TValue); + EmbeddingVarContext ev_ctx(c); + if (ev->IsSingleHbm()) { + const TKey* key_base = &indices_flat(0); + const Device& device = c->eigen_device(); + if (is_use_default_value_tensor_) { + Tensor default_values(c->input(2)); + auto default_value_num = + default_values.NumElements() / ev->ValueLen(); + auto default_values_matrix = default_values.shaped( + {default_value_num, ev->ValueLen()}); + TValue* default_v_base = &default_values_matrix(0, 0); + 
ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } else { + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } + } else { + Tensor indices_host(indices.dtype(), indices.shape()); + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = c->op_device_context()->stream(); + auto event_mgr = + c->device()->tensorflow_accelerator_device_info()->event_mgr; + se::DeviceMemoryBase gpu_src(const_cast(&indices_flat(0)), + N * sizeof(TKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TKey)); + SyncWithEventMgr(stream, event_mgr); + + EmbeddingVarContext ev_ctx(c); + ev->GetEmbeddings(ev_ctx, (TKey*)indices_host.data(), out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices_host, indices_counts, true); + } else { + ev->UpdateCache(indices_host, true); + } + } + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceGather") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherGPUOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU); +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceGatherV1") \ + .Device(DEVICE_##dev) \ + .HostMemory("counts") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherGPUOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class EVGetFrequencyOp : public OpKernel { + public: + explicit EVGetFrequencyOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, {indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int64 f = ev->GetFreq(indices_flat(i)); + output->flat()(i) = f; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("EVGetFrequency") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("Tvalues"), \ + EVGetFrequencyOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class EVGetVersionOp : public OpKernel { + public: + explicit EVGetVersionOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, 
{indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int64 v = ev->GetVersion(indices_flat(i)); + output->flat()(i) = v; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("EVGetVersion") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("Tvalues"), \ + EVGetVersionOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceLookupTierOp : public OpKernel { + public: + explicit KvResourceLookupTierOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, {indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int v = ev->storage()->LookupTier(indices_flat(i)); + output->flat()(i) = v; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupTier") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupTierOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupTier") \ + .Device(DEVICE_GPU) \ + .HostMemory("ids") \ + .HostMemory("output") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupTierOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc new file mode 100644 index 00000000..922c2122 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc @@ -0,0 +1,620 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "training_ali_ops_gpu.h" +#endif + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/gather_functor.h" +#include "tensorflow/core/kernels/scatter_functor.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +// Please use the appropriate namespace for your project +namespace tensorflow { + +using ::tensorflow::OpKernel; +using ::tensorflow::OpKernelConstruction; +using ::tensorflow::OpKernelContext; +using ::tensorflow::Tensor; +using ::tensorflow::errors::InvalidArgument; + +// ----------------------------------------------------------------------------------------------- +// KvVarHandle +// ----------------------------------------------------------------------------------------------- +template +class KvVarHandleOp : public OpKernel { + public: + explicit KvVarHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Tkeys", &key_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_and_shape_.dtype)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &dtype_and_shape_.shape)); + OP_REQUIRES(ctx, dtype_and_shape_.shape.dims() == 1, + errors::Aborted("len(shape) must be 1")); + OP_REQUIRES(ctx, dtype_and_shape_.shape.dim_size(0) > 0, + errors::Aborted("shape[0] must > 0")); + + info_ = Info(); + is_anonymous_ = name_ == ResourceHandle::ANONYMOUS_NAME; + + // Use const_tensor_ if the variable is non-anonymous. + if (!is_anonymous_) { + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), + &const_tensor_, attr)); + const_tensor_.scalar()() = + MakeResourceHandle>( + ctx, container_, name_, + std::vector{dtype_and_shape_}); + std::cout << "[EV INFO] Create non-anonymous " + info_ << std::endl; + } + } + + void Compute(OpKernelContext* ctx) override { + if (is_anonymous_) { + // throw std::invalid_argument("EV cannot be ANONYMOUS!"); + OP_REQUIRES(ctx, false, + errors::InvalidArgument("EV cannot be ANONYMOUS!")); + } else { + ctx->set_output(0, const_tensor_); + } + } + + const Tensor* const_tensor() const override { + return is_anonymous_ ? 
nullptr : &const_tensor_; + } + + private: + bool is_anonymous_; + std::string container_; + std::string name_; + std::string info_; + DataType key_type_; + DtypeAndPartialTensorShape dtype_and_shape_; + Tensor const_tensor_; + + std::string Info() { + std::string dtype = DataTypeString(dtype_and_shape_.dtype); + std::string key_type = DataTypeString(key_type_); + std::string dim_0 = std::to_string(dtype_and_shape_.shape.dim_size(0)); + std::string shape = "[" + dim_0 + "]"; + std::string info = + " handle: " + container_ + "/" + name_ + ", "; + info += "key_type: " + key_type + ", dtype: " + dtype + ", shape: " + shape; + return info; + } +}; + +#define REGISTER_KV_VAR_HANDLE(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarHandleOp") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvVarHandleOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KV_VAR_HANDLE(dev, int32, type) \ + REGISTER_KV_VAR_HANDLE(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KV_VAR_HANDLE + +template +class KvVariableShapeOp : public OpKernel { + public: + explicit KvVariableShapeOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + TensorShape shape({ev->Size(), ev->ValueLen()}); + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {shape.dims()}, &output)); + for (int i = 0; i < shape.dims(); ++i) { + output->flat()(i) = shape.dim_size(i); + } + } +}; + +#define REGISTER_KERNELS(dev, type, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVariableShape") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("out_type") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("output"), \ + KvVariableShapeOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, int32, type) \ + REGISTER_KERNELS(dev, int32, int64, type) \ + REGISTER_KERNELS(dev, int64, int32, type) \ + REGISTER_KERNELS(dev, int64, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +class DestroyKvResourceOp : public OpKernel { + public: + explicit DestroyKvResourceOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, + ctx->GetAttr("ignore_lookup_error", &ignore_lookup_error_)); + } + + void Compute(OpKernelContext* ctx) override { + const ResourceHandle& p = HandleFromInput(ctx, 0); + Status status = DeleteResource(ctx, p); + if (ignore_lookup_error_ && errors::IsNotFound(status)) { + return; + } + OP_REQUIRES_OK(ctx, status); + } + + private: + bool ignore_lookup_error_; +}; + +REGISTER_KERNEL_BUILDER(Name("DestroyKvResourceOp").Device(DEVICE_CPU), + DestroyKvResourceOp); + +template +class InitializeKvVariableOp : public OpKernel { + public: + explicit 
InitializeKvVariableOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(c, c->GetAttr("counter_type", &counter_type_)); + OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_)); + OP_REQUIRES(c, shape_.dims() == 1, + errors::InvalidArgument("KvVariable dimension must be 1")); + OP_REQUIRES_OK(c, c->GetAttr("emb_index", &emb_index_)); + OP_REQUIRES_OK(c, c->GetAttr("block_num", &block_num_)); + OP_REQUIRES_OK(c, c->GetAttr("slot_index", &slot_index_)); + OP_REQUIRES_OK(c, c->GetAttr("steps_to_live", &steps_to_live_)); + OP_REQUIRES_OK(c, c->GetAttr("filter_freq", &filter_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("max_freq", &max_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("max_element_size", &max_element_size_)); + OP_REQUIRES_OK(c, c->GetAttr("false_positive_probability", + &false_positive_probability_)); + OP_REQUIRES_OK(c, c->GetAttr("l2_weight_threshold", &l2_weight_threshold_)); + OP_REQUIRES_OK(c, c->GetAttr("default_value_dim", &default_value_dim_)); + OP_REQUIRES_OK(c, c->GetAttr("default_value_no_permission", + &default_value_no_permission_)); + OP_REQUIRES_OK(c, c->GetAttr("slot_num", &slot_num_)); + OP_REQUIRES_OK(c, c->GetAttr("record_freq", &record_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); + int embedding_var_type = 0; + Status s = c->GetAttr("embedding_variable_type", &embedding_var_type); + if (!s.ok()) { + // Not InitializeKvVariableV2Op! + embedding_var_type = embedding::EmbeddingVariableType::MUTABLE; + } + is_inference_ = false; + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + is_inference_ |= + (embedding_var_type == embedding::EmbeddingVariableType::IMMUTABLE); + + // initial_num_buckets is useless, so is used to set is_set_initialized_. 
+ int64 initial_num_buckets = 0; + OP_REQUIRES_OK(c, c->GetAttr("initial_num_buckets", &initial_num_buckets)); + is_set_initialized_ = true; + if (initial_num_buckets == + embedding::IsSetInitialized::NOT_SET_INITAILIZED) { + is_set_initialized_ = false; + } + + int64 storage_type = 0; + OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); + storage_type_ = static_cast(storage_type); + device_type_str_ = c->device_type().type_string(); + if (storage_type_ == embedding::DEFAULT) { + if (device_type_str_ == "CPU") { + storage_type_ = embedding::DRAM; + } else { + storage_type_ = embedding::HBM; + } + } + + bool if_op_on_gpu = (device_type_str_ == "GPU"); + bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || + storage_type_ == embedding::HBM_DRAM || + storage_type_ == embedding::HBM_DRAM_SSDHASH); + OP_REQUIRES( + c, if_op_on_gpu == if_embedding_on_hbm, + errors::InvalidArgument("Storage of EV and device of Op mismatch.")); + + OP_REQUIRES_OK(c, c->GetAttr("storage_path", &storage_path_)); + OP_REQUIRES_OK(c, c->GetAttr("storage_size", &storage_size_)); + + if (filter_freq_ < 0) { + LOG(INFO) << "filter_freq < 0 is invalid, feature filter is disabled."; + filter_freq_ = 0; + } + + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); + + OP_REQUIRES(c, steps_to_live_ >= 0, + errors::InvalidArgument("steps_to_live must >= 0, ", + std::to_string(steps_to_live_))); + + OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); + OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& default_values = context->input(2); + + OP_REQUIRES(context, dtype_ == default_values.dtype(), + errors::InvalidArgument( + "Variable and value dtypes don't match; respectively, ", + dtype_, " and ", default_values.dtype())); + + ResourceHandle handle_self = HandleFromInput(context, 0); + ResourceHandle handle_primary = HandleFromInput(context, 1); + std::string opname = handle_self.name(); + + EmbeddingVar* ev = nullptr; + + if (handle_self.name() == handle_primary.name() && + handle_self.container() == handle_primary.container()) { + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_self, &ev, + [this, default_values, opname, context, + handle_self](EmbeddingVar** ptr) { + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, emb_index_, + block_num_, slot_num_, opname + "-primary", steps_to_live_, + filter_freq_, max_freq_, l2_weight_threshold_, + max_element_size_, false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type_, storage_path_, + storage_size_, embedding_config), + alloc_for_ev, feat_desc, handle_self.name()); + *ptr = new EmbeddingVar(handle_self.name(), + storage, embedding_config, + alloc_for_ev, feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); + } else { + EmbeddingVar* primary_variable = nullptr; + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_primary, &primary_variable, + [this, default_values, opname, handle_primary, + context](EmbeddingVar** ptr) { + int64 primary_slot_index(0), primary_emb_index(0); + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + primary_emb_index + block_num_ * primary_slot_index, + primary_emb_index, block_num_, slot_num_, + opname + "-primary", steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, max_element_size_, + false_positive_probability_, counter_type_, 0, record_freq_, + record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type_, storage_path_, + storage_size_, embedding_config), + alloc_for_ev, feat_desc, handle_primary.name()); + *ptr = new EmbeddingVar(handle_primary.name(), + storage, embedding_config, + alloc_for_ev, feat_desc); + // default_values is slot value, should not to initialize + // primary value + return OkStatus(); + })); + + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_self, &ev, + [this, default_values, opname, primary_variable, handle_self, + context](EmbeddingVar** ptr) { + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, emb_index_, + block_num_, slot_num_, opname, steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, max_element_size_, + false_positive_probability_, counter_type_, + default_value_dim_, default_value_no_permission_, + record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + *ptr = new EmbeddingVar( + handle_self.name(), primary_variable->storage(), + embedding_config, alloc_for_ev, + primary_variable->feature_descriptor()); + return (*ptr)->Init(default_values, default_value_dim_); + })); + core::ScopedUnref unref_me(primary_variable); + } + core::ScopedUnref unref_me(ev); + if (is_set_initialized_) { + ev->SetInitialized(); + } + } + + private: + DataType dtype_; + DataType counter_type_; + TensorShape shape_; + int64 steps_to_live_; + int64 emb_index_; + int64 block_num_; + int64 slot_index_; + int64 slot_num_; + std::string ht_type_; + int64 ht_partition_num_; + int64 filter_freq_; + int64 max_freq_; + float l2_weight_threshold_; + int64 max_element_size_; + float false_positive_probability_; + embedding::StorageType storage_type_; + std::string storage_path_; + std::vector storage_size_; + int64 default_value_dim_; + float default_value_no_permission_; + bool record_freq_; + bool record_version_; + bool is_inference_; + bool is_set_initialized_; + std::string device_type_str_; +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableOp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableOp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceIsInitializedOp : public OpKernel { + public: + explicit KvResourceIsInitializedOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output)); + EmbeddingVar* ev = nullptr; + bool found; + if (LookupResource>(ctx, HandleFromInput(ctx, 0), + &ev) + .ok()) { + found = ev->IsInitialized(); + ev->Unref(); + } else { + found = false; + } + + output->flat()(0) = found; + } +}; +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarIsInitializedOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("is_initialized") \ + .Device(DEVICE_##dev), \ 
+ KvResourceIsInitializedOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceIsAllSlotInitializedOp : public OpKernel { + public: + explicit KvResourceIsAllSlotInitializedOp(OpKernelConstruction* c) + : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output)); + EmbeddingVar* ev = nullptr; + bool found; + if (LookupResource>(ctx, HandleFromInput(ctx, 0), + &ev) + .ok()) { + found = ev->IsAllSlotInitialized(); + ev->Unref(); + } else { + found = false; + } + output->flat()(0) = found; + } +}; +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarIsAllSlotInitializedOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("is_all_slot_initialized") \ + .Device(DEVICE_##dev), \ + KvResourceIsAllSlotInitializedOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceInitCacheStrategyOp : public OpKernel { + public: + explicit KvResourceInitCacheStrategyOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("cache_strategy", &cache_strategy_)); + } + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + ev->InitCache(static_cast(cache_strategy_)); + } + + private: + int cache_strategy_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceInitCacheStrategyOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .Device(DEVICE_##dev), \ + KvResourceInitCacheStrategyOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc new file mode 100644 index 00000000..3f19372a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc @@ -0,0 +1,259 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "xla/stream_executor/cuda/cuda_activation.h" +using stream_executor::cuda::ScopedActivateExecutorContext; +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +using stream_executor::rocm::ScopedActivateExecutorContext; + +#endif + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/gather_functor.h" +#include "tensorflow/core/kernels/scatter_functor.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +constexpr int64 DEFAULT_RESTORE_THREAD_NUM = 4; + +class KvRestoreThreadPool { + public: + KvRestoreThreadPool() { + TF_CHECK_OK(ReadInt64FromEnvVar("TF_EV_RESTORE_THREAD_NUM", + DEFAULT_RESTORE_THREAD_NUM, &thread_num_)); + } + + static thread::ThreadPool* GetInstance() { + static thread::ThreadPool tp(Env::Default(), "restore_ev_threadpool", + thread_num_); + return &tp; + } + + private: + static int64 thread_num_; +}; + +int64 KvRestoreThreadPool::thread_num_ = DEFAULT_RESTORE_THREAD_NUM; + +template +class KvResourceImportV3Op : public AsyncOpKernel { + public: + explicit KvResourceImportV3Op(OpKernelConstruction* c) : AsyncOpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_)); + OP_REQUIRES(c, shape_.dims() == 1, + errors::InvalidArgument("KvVariable dimension must be 1")); + OP_REQUIRES_OK(c, c->GetAttr("partition_id", &partition_id_)); + OP_REQUIRES(c, partition_id_ >= 0, + errors::InvalidArgument("partition_id must >= 0, ", + std::to_string(partition_id_))); + OP_REQUIRES_OK(c, c->GetAttr("partition_num", &partition_num_)); + OP_REQUIRES(c, partition_num_ >= 1, + errors::InvalidArgument("partition_num must >= 1, ", + std::to_string(partition_num_))); + OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); + bool reset_version = false; + TF_CHECK_OK( + ReadBoolFromEnvVar("TF_EV_RESET_VERSION", false, 
&reset_version)); + reset_version_ = reset_version_ || reset_version; + + TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, + &ev_async_restore_)); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& file_name = context->input(0); + const std::string file_name_string = file_name.scalar()(); + const Tensor& name = context->input(2); + const std::string name_string = name.scalar()(); + + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(context, + LookupResource(context, HandleFromInput(context, 1), &ev)); + + core::ScopedUnref unref_me(ev); + + // EV should not be initialized at this time. + if (ev->IsInitialized()) { + LOG(WARNING) << "EV (" << name_string + << ") has already been initialized."; + } + + auto do_compute = [this, context, file_name_string, ev, name_string, + done]() { + BundleReader reader(Env::Default(), file_name_string); + auto s = reader.status(); + if (!s.ok()) { + LOG(FATAL) << "Restore EV failure, create BundleReader error:" + << s.ToString(); + done(); + } + + if (ev->IsSingleHbm()) { +#if GOOGLE_CUDA + ScopedActivateExecutorContext scoped_activation{ + context->op_device_context()->stream()->parent()}; + const Eigen::GpuDevice& device = context->eigen_gpu_device(); + ev->Restore(name_string, file_name_string, partition_id_, + partition_num_, false, &reader, reset_version_, &device); +#endif + } else { + ev->Restore(name_string, file_name_string, partition_id_, + partition_num_, false, &reader, reset_version_, nullptr); + } + ev->SetInitialized(); + done(); + }; + + if (ev_async_restore_) { + auto tp = KvRestoreThreadPool::GetInstance(); + tp->Schedule(do_compute); + } else { + do_compute(); + } + } + + private: + int64 partition_id_; + int64 partition_num_; + DataType dtype_; + TensorShape shape_; + bool reset_version_; + bool ev_async_restore_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype, device) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceImportV3") \ + .Device(DEVICE_##dev) \ + .HostMemory("prefix") \ + .HostMemory("tensor_names") \ + .HostMemory("empty_key") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceImportV3Op); +#define REGISTER_KERNELS_ALL(dev, type, device) \ + REGISTER_KERNELS(dev, int32, type, device) \ + REGISTER_KERNELS(dev, int64, type, device) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type, CPUDevice) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type, GPUDevice) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceIncrImportOp : public AsyncOpKernel { + public: + explicit KvResourceIncrImportOp(OpKernelConstruction* c) : AsyncOpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + + OP_REQUIRES_OK(c, c->GetAttr("partition_id", &partition_id_)); + OP_REQUIRES(c, partition_id_ >= 0, + errors::InvalidArgument("partition_id must >= 0, ", + std::to_string(partition_id_))); + OP_REQUIRES_OK(c, c->GetAttr("partition_num", &partition_num_)); + OP_REQUIRES(c, partition_num_ >= 1, + errors::InvalidArgument("partition_num must >= 1, ", + std::to_string(partition_num_))); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& file_name = context->input(0); + const std::string file_name_string = file_name.scalar()(); + const Tensor& name = context->input(2); + 
const std::string name_string = name.scalar()(); + + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(context, + LookupResource(context, HandleFromInput(context, 1), &ev)); + + core::ScopedUnref unref_me(ev); + + BundleReader reader(Env::Default(), file_name_string); + OP_REQUIRES_OK(context, reader.status()); + + LOG(INFO) << "incr import, evname:" << name_string + << "partition_num:" << partition_num_; + + ev->Restore(name_string, file_name_string, partition_id_, partition_num_, + true, &reader); + ev->SetInitialized(); + done(); + } + + private: + int64 partition_id_; + int64 partition_num_; + DataType dtype_; + TensorShape shape_; + int64 steps_to_live_; + bool restore_versions_; + string ht_type_; + int64 ht_partition_num_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceIncrImport") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceIncrImportOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc new file mode 100644 index 00000000..63d7760d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc @@ -0,0 +1,69 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + +#include "kv_variable_util.h" + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +Status MoveMatchingFiles(Env* env, const tstring& pattern, + const tstring& merged_prefix, + int64 input_prefix_size) { + std::vector file_vec; + TF_RETURN_IF_ERROR(env->GetMatchingPaths(pattern, &file_vec)); + for (int64 i = 0; i < file_vec.size(); i++) { + const tstring& filename = tstring(file_vec[i].substr(input_prefix_size)); + TF_RETURN_IF_ERROR(env->RenameFile(file_vec[i], merged_prefix + filename)); + } + return OkStatus(); +} + +Status MoveSsdFiles(Env* env, const gtl::ArraySlice& input_prefixes, + const tstring& merged_prefix) { + for (auto input_prefix : input_prefixes) { + const tstring& input_ssd_record_pattern = input_prefix + "*-ssd_record*"; + TF_RETURN_IF_ERROR(MoveMatchingFiles(env, input_ssd_record_pattern, + merged_prefix, input_prefix.size())); + + const tstring& input_emb_files_pattern = input_prefix + "*-emb_files"; + TF_RETURN_IF_ERROR(MoveMatchingFiles(env, input_emb_files_pattern, + merged_prefix, input_prefix.size())); + } + return OkStatus(); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h new file mode 100644 index 00000000..a44c0d8e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h @@ -0,0 +1,165 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ +#define TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class EVKeyDumpIterator : public DumpIterator { + public: + EVKeyDumpIterator(std::vector& key_list) : key_list_(key_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < key_list_.size(); } + + T Next() { return key_list_[keys_idx_++]; } + + private: + int64 keys_idx_; + std::vector& key_list_; +}; + +template +class EVValueDumpIterator : public DumpIterator { + public: + EVValueDumpIterator(EmbeddingVar*& ev, std::vector& valueptr_list) + : ev_(ev), valueptr_list_(valueptr_list) { + keys_idx_ = 0; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_idx_ < valueptr_list_.size()) { + if (keys_idx_ < valueptr_list_.size() - 1) + return true; + else + return col_idx_ < ev_->ValueLen(); + } else + return false; + } + + T Next() { + if (col_idx_ >= ev_->ValueLen()) { + keys_idx_++; + col_idx_ = 0; + } + Eigen::array dims({ev_->ValueLen()}); + typename TTypes::Flat value_flat = + typename TTypes::Flat(valueptr_list_[keys_idx_], dims); + return value_flat(col_idx_++); + } + + private: + EmbeddingVar* ev_; + std::vector& valueptr_list_; + int64 keys_idx_; + int64 col_idx_; +}; + +template +class EVVersionDumpIterator : public DumpIterator { + public: + EVVersionDumpIterator(std::vector& version_list) + : version_list_(version_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < version_list_.size(); } + + T Next() { return version_list_[keys_idx_++]; } + + private: + std::vector& version_list_; + int64 keys_idx_; +}; + +template +class EVFreqDumpIterator : public DumpIterator { + public: + EVFreqDumpIterator(std::vector& freq_list) : freq_list_(freq_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < freq_list_.size(); } + + T Next() { return freq_list_[keys_idx_++]; } + + private: + std::vector& freq_list_; + int64 keys_idx_; +}; + +template +class EVOffsetDumpIterator : public DumpIterator { + public: + EVOffsetDumpIterator(std::vector& offset_list) + : offset_list_(offset_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < offset_list_.size(); } + + T Next() { return offset_list_[keys_idx_++]; } + + private: + std::vector& offset_list_; + int64 keys_idx_; +}; + +template +Status GetInputEmbeddingVar(OpKernelContext* ctx, int input, + EmbeddingVar** var) { + if (LookupResource(ctx, HandleFromInput(ctx, input), var).ok()) { + return OkStatus(); + } 
else { + return errors::Internal("Invalid versioned variable reference."); + } +} + +Status MoveMatchingFiles(Env* env, const tstring& pattern, + const tstring& merged_prefix, int64 input_prefix_size); + +/*Move two files and one directory: +1. xxxxx-ssd_record.index +2. xxxxx-ssd_record.data +3. xxxxxx-emb_files/ +1 and 2 record the meta data of SSDHash, +and 3 records the embeddings on SSD*/ +Status MoveSsdFiles(Env* env, const gtl::ArraySlice& input_prefixes, + const tstring& merged_prefix); +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc new file mode 100644 index 00000000..9a19319a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc @@ -0,0 +1,176 @@ +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +namespace { + +// Shared validations of the inputs to the SaveV2 and RestoreV2 ops. +void ValidateInputs(bool is_save_op, OpKernelContext* context, + const Tensor& prefix, const Tensor& tensor_names, + const Tensor& shape_and_slices, const int kFixedInputs) { + const int num_tensors = static_cast(tensor_names.NumElements()); + OP_REQUIRES( + context, prefix.NumElements() == 1, + errors::InvalidArgument("Input prefix should have a single element, got ", + prefix.NumElements(), " instead.")); + OP_REQUIRES(context, + TensorShapeUtils::IsVector(tensor_names.shape()) && + TensorShapeUtils::IsVector(shape_and_slices.shape()), + errors::InvalidArgument( + "Input tensor_names and shape_and_slices " + "should be an 1-D tensors, got ", + tensor_names.shape().DebugString(), " and ", + shape_and_slices.shape().DebugString(), " instead.")); + OP_REQUIRES(context, + tensor_names.NumElements() == shape_and_slices.NumElements(), + errors::InvalidArgument("tensor_names and shape_and_slices " + "have different number of elements: ", + tensor_names.NumElements(), " vs. 
", + shape_and_slices.NumElements())); + OP_REQUIRES(context, + FastBoundsCheck(tensor_names.NumElements() + kFixedInputs, + std::numeric_limits::max()), + errors::InvalidArgument("Too many inputs to the op")); + OP_REQUIRES( + context, shape_and_slices.NumElements() == num_tensors, + errors::InvalidArgument("Expected ", num_tensors, + " elements in shapes_and_slices, but got ", + context->input(2).NumElements())); + if (is_save_op) { + OP_REQUIRES(context, context->num_inputs() == num_tensors + kFixedInputs, + errors::InvalidArgument( + "Got ", num_tensors, " tensor names but ", + context->num_inputs() - kFixedInputs, " tensors.")); + OP_REQUIRES(context, context->num_inputs() == num_tensors + kFixedInputs, + errors::InvalidArgument( + "Expected a total of ", num_tensors + kFixedInputs, + " inputs as input #1 (which is a string " + "tensor of saved names) contains ", + num_tensors, " names, but received ", context->num_inputs(), + " inputs")); + } +} + +} // namespace + +class SaveV3 : public OpKernel { + public: + explicit SaveV3(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + OP_REQUIRES_OK(context, context->GetAttr("ev_key_types", &ev_key_types_)); + OP_REQUIRES_OK(context, context->GetAttr("has_ev", &has_ev_)); + } + + template + void DumpEvWithGlobalStep(OpKernelContext* context, const string& tensor_name, + EmbeddingVar* ev, + BundleWriter& writer, DataType global_step_type) { + if (global_step_type == DT_INT32) { + DumpEv(context, ev, tensor_name, writer); + } else { + DumpEv(context, ev, tensor_name, writer); + } + } + + template + void DumpEv(OpKernelContext* context, EmbeddingVar* variable, + const string& tensor_name, BundleWriter& writer) { + const Tensor& global_step = context->input(5); + TGlobalStep global_step_scalar = global_step.scalar()(); + core::ScopedUnref s(variable); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = global_step_scalar; + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + OP_REQUIRES_OK(context, variable->Save(tensor_name, prefix_string, &writer, + shrink_args)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& prefix = context->input(0); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& ev_names = context->input(3); + const Tensor& ev_resources = context->input(4); + const int kFixedInputs = 5; + ValidateInputs(true /* is save op */, context, prefix, tensor_names, + shape_and_slices, kFixedInputs); + if (!context->status().ok()) return; + // Prefix, tensor names, shape_and_slices, ev names, ev resources. 
+ const int num_tensors = static_cast(tensor_names.NumElements()); + const int num_ev = static_cast(ev_names.NumElements()); + const string& prefix_string = prefix.scalar()(); + const auto& tensor_names_flat = tensor_names.flat(); + const auto& ev_names_flat = ev_names.flat(); + const auto& ev_resources_flat = ev_resources.flat(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + + BundleWriter writer(Env::Default(), prefix_string); + OP_REQUIRES_OK(context, writer.status()); + VLOG(1) << "BundleWriter, prefix_string: " << prefix_string; + + int start_index = 0; + if (has_ev_) { + start_index = 1; + } + + for (int i = 0; i < num_ev; i++) { + const string& ev_name = ev_names_flat(i); + if (ev_key_types_[i] == DT_INT32) { + EmbeddingVar* ev = + reinterpret_cast*>(ev_resources_flat(i)); + DumpEvWithGlobalStep(context, ev_name, ev, writer, tensor_types_[0]); + } else if (ev_key_types_[i] == DT_INT64) { + EmbeddingVar* ev = + reinterpret_cast*>(ev_resources_flat(i)); + DumpEvWithGlobalStep(context, ev_name, ev, writer, tensor_types_[0]); + } + } + + for (int i = start_index; i < num_tensors; ++i) { + const string& tensor_name = tensor_names_flat(i); + if (tensor_types_[i] == DT_RESOURCE) { + auto& handle = HandleFromInput(context, i + kFixedInputs); + + } else { + const Tensor& tensor = context->input(i + kFixedInputs); + + if (!shape_and_slices_flat(i).empty()) { + const string& shape_spec = shape_and_slices_flat(i); + TensorShape shape; + TensorSlice slice(tensor.dims()); + TensorShape slice_shape; + + OP_REQUIRES_OK(context, + checkpoint::ParseShapeAndSlice(shape_spec, &shape, + &slice, &slice_shape)); + OP_REQUIRES( + context, slice_shape.IsSameSize(tensor.shape()), + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", tensor.shape().DebugString())); + + OP_REQUIRES_OK(context, + writer.AddSlice(tensor_name, shape, slice, tensor)); + } else { + OP_REQUIRES_OK(context, writer.Add(tensor_name, tensor)); + } + } + } + OP_REQUIRES_OK(context, writer.Finish()); + } + + private: + DataTypeVector tensor_types_; + DataTypeVector ev_key_types_; + bool has_ev_; +}; +REGISTER_KERNEL_BUILDER(Name("SaveV3").Device(DEVICE_CPU), SaveV3); + +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h new file mode 100644 index 00000000..4b3a5fa1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h @@ -0,0 +1,82 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ +#define TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ + +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/util/tensor_slice_reader.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +class OpKernelContext; + +template +class DumpIterator { + public: + virtual ~DumpIterator() {} + virtual bool HasNext() const = 0; + virtual T Next() = 0; +}; + +template +Status SaveTensorWithFixedBuffer(const string& tensor_name, + BundleWriter* writer, char* dump_buffer, + size_t bytes_limit, DumpIterator* dump_iter, + const TensorShape& dump_tensor_shape, + bool use_shape = true) { + bool dump_happened = false; + size_t bytes_written = 0; + int buffer_idx = 0; + Status st; + int64 total_bytes_written = 0; + T* key_dump_buffer = (T*)dump_buffer; + if (use_shape) + st = writer->AddTensorHeader(tensor_name, DataTypeToEnum::v(), + dump_tensor_shape); + if (!st.ok()) return st; + + while (dump_iter->HasNext()) { + T key = dump_iter->Next(); + if (bytes_written + sizeof(T) > bytes_limit) { + dump_happened = true; + TF_CHECK_OK(writer->AppendSegmentData(dump_buffer, bytes_written)); + bytes_written = 0; + buffer_idx = 0; + } + key_dump_buffer[buffer_idx] = key; + buffer_idx++; + bytes_written += sizeof(T); + total_bytes_written += sizeof(T); + } + + if (!dump_happened) { + VLOG(1) << tensor_name + << " only one buffer written, size:" << bytes_written; + TF_CHECK_OK(writer->AddCompeleteData(dump_buffer, bytes_written)); + } else { + VLOG(1) << tensor_name + << " mutiple buffer written, size:" << total_bytes_written + << ", bytes written:" << bytes_written; + TF_CHECK_OK(writer->AppendSegmentData(dump_buffer, bytes_written)); + writer->EndSegmentData(total_bytes_written, bytes_written); + } + return OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc new file mode 100644 index 00000000..46e72845 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc @@ -0,0 +1,383 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdagradOp : public OpKernel { + public: + explicit KvSparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1}); + + EmbeddingVar* var = NULL; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + EmbeddingVar* accum = NULL; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum)); + core::ScopedUnref unref_accum(accum); + + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + const Tensor& global_step = ctx->input(5); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(6); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + int64 gs = global_step.scalar()(); + auto do_work = [this, 
ctx, &indices_vec, var, accum, &grad_flat, &gs, + &lr_scalar, indices_counts, + get_count_fn](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto var_i = var->flat(value_ptr); + auto a = accum->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + a += g.square(); + var_i -= g.constant(lr_scalar) * g * a.rsqrt(); + } + } + }; + const int64 cost = 1000; // very unreliable estimate for cost per step. + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(6); + var->UpdateCache(indices, indices_counts); + } + } + } + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +class KvSparseApplyAdagradGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdagradGPUOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, + EmbeddingVar* accum, T** var_ptr, T** acc_ptr, + T lr_scalar, const T* grad_base, const int64 task_size, + se::Stream* stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 2); + T** dev_acc_ptr = dev_var_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_acc_ptr); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 2); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 2); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + functor::KvSparseApplyAdagradHbm()( + block_size, embedding_dim, dev_acc_ptr, dev_var_ptr, grad_base, + lr_scalar, task_size, gpu_device); + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1}); + + EmbeddingVar* var = nullptr; + 
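+    // Same per-key Adagrad update as the CPU kernel above
+    // (accum += grad^2; var -= lr * grad / sqrt(accum)), but executed on the
+    // GPU: the single-HBM case calls the KvSparseApplyAdagrad functor
+    // directly, otherwise the ids are copied to the host, value pointers are
+    // gathered via GetEmbeddingPointers, and KvSparseApplyAdagradHbm applies
+    // the update on device.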
OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + EmbeddingVar* accum = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum)); + core::ScopedUnref unref_accum(accum); + + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + const Tensor& global_step = ctx->input(5); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (inner_dim > 0) { + auto indices_flat = indices.flat(); + auto grad_flat = grad.flat_outer_dims(); + int64 gs = global_step.scalar()(); + T lr_scalar = lr.scalar()(); + if (var->IsSingleHbm()) { + const Tindex* key_base = &indices_flat(0); + const T* grad_base = &grad_flat(0); + const Device& device = ctx->eigen_device(); + + functor::KvSparseApplyAdagrad()( + N, ctx->get_allocator(AllocatorAttributes()), var, accum, + key_base, grad_base, lr_scalar, gs, device); + } else { + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_flat(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
6 : -1; + T** var_ptr = new T*[N * 2]; + T** acc_ptr = var_ptr + N; + std::vector*, T**>> vars(2); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(accum, acc_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, accum, var_ptr, acc_ptr, lr_scalar, &grad_flat(0), + N, stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + } + } + + private: + bool use_exclusive_lock_; + std::unique_ptr thread_copy_id_alloc_; +}; + +namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex) \ + template <> \ + void KvSparseApplyAdagrad::operator()( \ + int32 num_items, Allocator* alloc, EmbeddingVar* var, \ + EmbeddingVar* accum, const Tindex* key_base, const T* grad, \ + T lr, int64 gs, const GPUDevice& device); \ + extern template struct KvSparseApplyAdagrad; +DECLARE_GPU_SPEC(float, int32); +DECLARE_GPU_SPEC(double, int32); +DECLARE_GPU_SPEC(float, int64); +DECLARE_GPU_SPEC(double, int64); +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc new file mode 100644 index 00000000..0517788c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc @@ -0,0 +1,603 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdamAsyncOp : public OpKernel { + public: + explicit KvSparseApplyAdamAsyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("apply_sparse_rmsprop", &apply_sparse_rmsprop_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2, 3, 4}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, use_exclusive_lock_, true, &beta1_power)); + + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, true, &beta2_power)); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + 
errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + if (N > 0) { + if (apply_sparse_rmsprop_) { + auto indices_vec = indices.vec(); + + auto grad_flat = grad.flat_outer_dims(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + + auto do_work = [this, ctx, &indices_vec, &var, v, m, &grad_flat, + &beta2_scalar, &beta1_scalar, &epsilon_scalar, + &lr_scalar, &global_step, get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + Tstep gs = global_step.scalar()(); + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); + auto grad_ = grad_flat.template chip<0>(i); + + v_ = v_ * v_.constant(beta2_scalar) + + grad_.square() * grad_.constant(T(1) - beta2_scalar); + m_ = m_ * m_.constant(beta1_scalar) + + (v_ + v_.constant(epsilon_scalar)).rsqrt() * + v_.constant(lr_scalar) * grad_; + + auto v = var->flat(value_ptr); + v -= m_; + } + } + }; + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + } else { + auto beta1_power_scalar = beta1_power.scalar(); + auto beta2_power_scalar = beta2_power.scalar(); + T lr_scalar = lr.scalar()(); + T beta1_scalar = beta1.scalar()(); + T beta2_scalar = beta2.scalar()(); + T epsilon_scalar = epsilon.scalar()(); + const T alpha = + lr_scalar * + Eigen::numext::sqrt(static_cast(1) - beta2_power_scalar()) / + (static_cast(1) - beta1_power_scalar()); + + auto do_work = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, + &lr_scalar, &beta1_scalar, &beta1_power, &beta2_power, + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto indices_vec = indices.vec(); 
+ Tstep gs = global_step.scalar()(); + + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK( + ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + auto var_i = var->flat(value_ptr); + + m_a = + m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); + v_a = v_a * beta2_scalar + + g.square() * (static_cast(1) - beta2_scalar); + var_i -= (m_a * alpha) / (v_a.sqrt() + epsilon_scalar); + } + } + } + }; + + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + beta1_power_scalar() *= beta1_scalar; + beta2_power_scalar() *= beta2_scalar; + } + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool apply_sparse_rmsprop_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(CPU, T, int32, int32); \ + REGISTER_KERNELS(CPU, T, int64, int32); \ + REGISTER_KERNELS(CPU, T, int32, int64); \ + REGISTER_KERNELS(CPU, T, int64, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyAdamAsyncGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdamAsyncGPUOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("apply_sparse_rmsprop", &apply_sparse_rmsprop_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, EmbeddingVar* m, + EmbeddingVar* v, T** var_ptr, T** m_ptr, + T** v_ptr, T beta1, T beta2, T epsilon, T lr, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + const T* grad_base, const int64 task_size, + se::Stream* stream, EventMgr* event_mgr, + const Eigen::GpuDevice& 
gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 3); + T** dev_m_ptr = dev_var_ptr + task_size; + T** dev_v_ptr = dev_m_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_m_ptr); + CHECK(dev_v_ptr); + + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + T* beta1_power_ptr = beta1_power_scalar.data(); + T* beta2_power_ptr = beta2_power_scalar.data(); + if (apply_sparse_rmsprop_) { + functor::KvSparseApplyAdamAsyncSparseRmspropHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, + grad_base, lr, beta1, beta2, epsilon, task_size, gpu_device); + } else { + functor::KvSparseApplyAdamAsyncHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, + grad_base, lr, beta1, beta2, epsilon, beta1_power_ptr, + beta2_power_ptr, task_size, gpu_device); + } + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2, 3, 4}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, use_exclusive_lock_, true, &beta1_power)); + + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, true, &beta2_power)); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension 
should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (var->IsSingleHbm()) { + const Device& device = ctx->eigen_device(); + OP_REQUIRES_OK( + ctx, functor::KvSparseApplyAdamAsync()( + device, var, m, v, beta1_power.scalar(), + beta2_power.scalar(), indices.vec(), + grad.flat_outer_dims(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + global_step.scalar(), apply_sparse_rmsprop_, + inner_dim, ctx->get_allocator(AllocatorAttributes()))); + } else { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + Tstep gs = global_step.scalar()(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + auto beta1_power_scalar = beta1_power.scalar(); + auto beta2_power_scalar = beta2_power.scalar(); + + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_vec(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
12 : -1; + T** var_ptr = new T*[N * 3]; + T** m_ptr = var_ptr + N; + T** v_ptr = m_ptr + N; + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, m, v, var_ptr, m_ptr, v_ptr, beta1_scalar, + beta2_scalar, epsilon_scalar, lr_scalar, + beta1_power_scalar, beta2_power_scalar, &grad_flat(0), N, + stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool apply_sparse_rmsprop_; + std::unique_ptr thread_copy_id_alloc_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(GPU, T, int32, int32); \ + REGISTER_KERNELS(GPU, T, int64, int32); \ + REGISTER_KERNELS(GPU, T, int32, int64); \ + REGISTER_KERNELS(GPU, T, int64, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
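+// These specializations are presumably defined in the CUDA sources pulled in
+// via training_ali_ops_gpu.h; the `extern template` lines keep this
+// translation unit from instantiating them while still letting the GPU
+// kernels above link against them.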
+namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex, Tstep) \ + template <> \ + Status KvSparseApplyAdamAsync::operator()( \ + const GPUDevice& d, EmbeddingVar* var, \ + EmbeddingVar* m, EmbeddingVar* v, \ + typename TTypes::Scalar beta1_power_scalar, \ + typename TTypes::Scalar beta2_power_scalar, \ + typename TTypes::ConstVec indices_vec, \ + typename TTypes::ConstMatrix grad, \ + typename TTypes::ConstScalar lr_scalar, \ + typename TTypes::ConstScalar beta1_scalar, \ + typename TTypes::ConstScalar beta2_scalar, \ + typename TTypes::ConstScalar epsilon_scalar, \ + typename TTypes::ConstScalar global_step_scalar, \ + bool apply_sparse_rmsprop, const int64 inner_dim, Allocator* alloc); \ + extern template struct KvSparseApplyAdamAsync; + +#define DECLARE_GPU_SPEC_TYPE(T) \ + DECLARE_GPU_SPEC(T, int32, int32); \ + DECLARE_GPU_SPEC(T, int32, int64); \ + DECLARE_GPU_SPEC(T, int64, int32); \ + DECLARE_GPU_SPEC(T, int64, int64); + +DECLARE_GPU_SPEC_TYPE(float); +DECLARE_GPU_SPEC_TYPE(double); + +#undef DECLARE_GPU_SPEC_TYPE +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc new file mode 100644 index 00000000..7dd80c73 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc @@ -0,0 +1,529 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdamOp : public OpKernel { + public: + explicit KvSparseApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, 
var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + if (N > 0) { + T beta1_power_scalar = beta1_power.scalar()(); + T beta2_power_scalar = beta2_power.scalar()(); + T lr_scalar = lr.scalar()(); + T beta1_scalar = beta1.scalar()(); + T beta2_scalar = beta2.scalar()(); + T epsilon_scalar = epsilon.scalar()(); + const T alpha = + lr_scalar * + Eigen::numext::sqrt(static_cast(1) - beta2_power_scalar) / + (static_cast(1) - beta1_power_scalar); + + auto do_work = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, + &lr_scalar, &beta1_scalar, &beta1_power, &beta2_power, + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto indices_vec = indices.vec(); + Tstep gs = global_step.scalar()(); + + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + auto var_i = var->flat(value_ptr); + + m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); + v_a = v_a * beta2_scalar + + g.square() * (static_cast(1) - beta2_scalar); + var_i -= (m_a * alpha) / (v_a.sqrt() + epsilon_scalar); + } + } + } + }; + + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + 
.TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(CPU, T, int32, int32); \ + REGISTER_KERNELS(CPU, T, int64, int32); \ + REGISTER_KERNELS(CPU, T, int32, int64); \ + REGISTER_KERNELS(CPU, T, int64, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyAdamGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdamGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, EmbeddingVar* m, + EmbeddingVar* v, T** var_ptr, T** m_ptr, + T** v_ptr, T beta1, T beta2, T epsilon, T lr, + T beta1_power, T beta2_power, const T* grad_base, + const int64 task_size, se::Stream* stream, + EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 3); + T** dev_m_ptr = dev_var_ptr + task_size; + T** dev_v_ptr = dev_m_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_m_ptr); + CHECK(dev_v_ptr); + + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + functor::KvSparseApplyAdamHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, grad_base, + lr, beta1, beta2, epsilon, beta1_power, beta2_power, task_size, + gpu_device); + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (var->IsSingleHbm()) { + const Device& device = ctx->eigen_device(); + OP_REQUIRES_OK( + ctx, functor::KvSparseApplyAdam()( + device, var, m, v, beta1_power.scalar(), + beta2_power.scalar(), indices.vec(), + grad.flat_outer_dims(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + global_step.scalar(), inner_dim, + ctx->get_allocator(AllocatorAttributes()))); + } else { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + Tstep gs = global_step.scalar()(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + const T beta1_power_scalar = beta1_power.scalar()(); + const T beta2_power_scalar = beta2_power.scalar()(); + + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_vec(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
12 : -1; + T** var_ptr = new T*[N * 3]; + T** m_ptr = var_ptr + N; + T** v_ptr = m_ptr + N; + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, m, v, var_ptr, m_ptr, v_ptr, beta1_scalar, + beta2_scalar, epsilon_scalar, lr_scalar, + beta1_power_scalar, beta2_power_scalar, &grad_flat(0), N, + stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + std::unique_ptr thread_copy_id_alloc_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(GPU, T, int32, int32); \ + REGISTER_KERNELS(GPU, T, int64, int32); \ + REGISTER_KERNELS(GPU, T, int32, int64); \ + REGISTER_KERNELS(GPU, T, int64, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
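+// These DECLARE_GPU_SPEC entries only declare the functor specializations;
+// the matching explicit instantiations are provided in
+// training_ali_ops_gpu.cu.cc (added later in this patch), which is built as
+// CUDA code. Keeping bare declarations here lets this translation unit
+// register the GPU kernels without including any device-side headers.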
+namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex, Tstep) \ + template <> \ + Status KvSparseApplyAdam::operator()( \ + const GPUDevice& d, EmbeddingVar* var, \ + EmbeddingVar* m, EmbeddingVar* v, \ + typename TTypes::ConstScalar beta1_power_scalar, \ + typename TTypes::ConstScalar beta2_power_scalar, \ + typename TTypes::ConstVec indices_vec, \ + typename TTypes::ConstMatrix grad, \ + typename TTypes::ConstScalar lr_scalar, \ + typename TTypes::ConstScalar beta1_scalar, \ + typename TTypes::ConstScalar beta2_scalar, \ + typename TTypes::ConstScalar epsilon_scalar, \ + typename TTypes::ConstScalar global_step_scalar, \ + const int64 inner_dim, Allocator* alloc); \ + extern template struct KvSparseApplyAdam; + +#define DECLARE_GPU_SPEC_TYPE(T) \ + DECLARE_GPU_SPEC(T, int32, int32); \ + DECLARE_GPU_SPEC(T, int32, int64); \ + DECLARE_GPU_SPEC(T, int64, int32); \ + DECLARE_GPU_SPEC(T, int64, int64); + +DECLARE_GPU_SPEC_TYPE(float); +DECLARE_GPU_SPEC_TYPE(double); + +#undef DECLARE_GPU_SPEC_TYPE +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h new file mode 100644 index 00000000..3136c30b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h @@ -0,0 +1,182 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/variable_ops.h" + +namespace tensorflow { + +// ********************************************************************** +// TODO: candy.dc +// this code is duplicated from training_op_helpers.h +// Once this function and Class support template, this duplicated code +// should be removed +// ********************************************************************** + +// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`. +// +// If `input` corresponds to a `DT_RESOURCE`-type variable input, +// `*maybe_resource` will be updated to contain the underlying resource, and the +// caller will be responsible for calling `Unref()` on that resource. 
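+//
+// Hedged usage sketch (the template-argument lists were elided in this patch
+// text; the <int64, float> arguments below are an assumption that mirrors the
+// EmbeddingVar's key/value types, not a confirmed signature):
+//
+//   EmbeddingVar<int64, float>* var = nullptr;
+//   mutex* mu = GetTrainingEmbeddingVariableMutex<int64, float>(ctx, 0, &var);
+//   core::ScopedUnref unref(var);      // caller owns the reference when set
+//   if (mu != nullptr) {
+//     mutex_lock l(*mu);               // guard the update on the variable
+//     /* ... apply gradient to var ... */
+//   }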
+template +mutex* GetTrainingEmbeddingVariableMutex(OpKernelContext* ctx, int input, + EmbeddingVar** maybe_resource) { + *maybe_resource = nullptr; + if (ctx->input_dtype(input) == DT_RESOURCE) { + if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) { + return (*maybe_resource)->mu(); + } else { + ctx->CtxFailureWithWarning( + errors::Internal("Invalid variable reference.")); + return nullptr; + } + } + return ctx->input_ref_mutex(input); +} + +// Utility structure that releases a sequence of borrowed mutexes when it is +// deleted. +template +struct EmbeddingVariableInputLockHolder { + public: + EmbeddingVariableInputLockHolder( + std::vector*> vars, + std::unique_ptr> locks) + : vars_(std::move(vars)), locks_(std::move(locks)) {} + + EmbeddingVariableInputLockHolder(EmbeddingVariableInputLockHolder&& other) + : vars_(std::move(other.vars_)), locks_(std::move(other.locks_)) {} + + ~EmbeddingVariableInputLockHolder() { + // Release the locks before unreffing the Vars, because each lock + // is potentially borrowed from a Var in vars_. + locks_.reset(); + for (EmbeddingVar* var : vars_) { + var->Unref(); + } + } + + private: + std::vector*> vars_; + // NOTE: Use a `std::unique_ptr` instead of moving in a vector directly, + // because a `std::vector` is not movable on all platforms. + std::unique_ptr> locks_; +}; + +template +EmbeddingVariableInputLockHolder +MaybeLockEmbeddingVariableInputMutexesInOrder( + OpKernelContext* ctx, bool do_lock, const std::vector& input_ids) { + if (!do_lock) { + return EmbeddingVariableInputLockHolder({}, {}); + } + std::vector*> vars; + std::vector mutexes; + std::vector acquire_order; + for (auto input : input_ids) { + EmbeddingVar* var; + mutex* mutex = GetTrainingEmbeddingVariableMutex(ctx, input, &var); + if (var) vars.push_back(var); + // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3). + if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) { + acquire_order.push_back(mutexes.size()); + mutexes.push_back(mutex); + } + } + std::sort(acquire_order.begin(), acquire_order.end(), + [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); + + std::unique_ptr> locks = + std::make_unique>(); + locks->reserve(acquire_order.size()); + + for (auto input : acquire_order) { + EmbeddingVar* var; + mutex* mu = GetTrainingEmbeddingVariableMutex(ctx, input, &var); + core::ScopedUnref scoped_unref(var); + if (mu != nullptr) { + locks->emplace_back(*mu); + } + } + return EmbeddingVariableInputLockHolder(std::move(vars), + std::move(locks)); +} + +template +void LookupKeyAndSetVersion(OpKernelContext* ctx, EmbeddingVar* var, + void** value_ptrs, Tstep gs, const K* indices, + int64 task_size, bool indices_as_pointer, + int counts_index) { + EmbeddingVarContext ev_ctx(ctx); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (counts_index != -1) { + const Tensor& counts_tensor = ctx->input(counts_index); + indices_counts = (int64*)counts_tensor.data(); + } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, task_size, indices_counts, + indices_as_pointer); + + auto update_version_fn = [var, value_ptrs, gs](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var->UpdateVersion(value_ptrs[i], gs); + } + }; + const int64 unit_cost = 1000; // very unreliable estimate for cost per step. 
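+  // Shard the per-key version update across the intra-op CPU thread pool.
+  // unit_cost is only a rough hint to the sharder (as noted above), so the
+  // work split is best-effort rather than an exact cost model.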
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, worker_threads->workers, task_size, + unit_cost, update_version_fn); +} + +template +void LookupEmbedding(OpKernelContext* ctx, + std::vector*, V**>>& vars, + void** value_ptrs, const K* indices, int64 num_of_keys) { + for (auto it : vars) { + EmbeddingVar* var = it.first; + V** var_ptr = it.second; + auto lookup_emb_fn = [var, var_ptr, value_ptrs](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; // very unreliable estimate for cost per + // step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, lookup_emb_fn); + } +} + +template +void GetEmbeddingPointers( + OpKernelContext* ctx, + std::vector*, V**>>& vars, const K* indices, + Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, + IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + std::vector value_ptrs(num_of_keys); + LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, + num_of_keys, indices_as_pointer, counts_index); + LookupEmbedding(ctx, vars, value_ptrs.data(), indices, num_of_keys); +} +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc new file mode 100644 index 00000000..41eb2631 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc @@ -0,0 +1,650 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "training_ali_ops_gpu.h" + +#include "deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +template +__device__ T impl_sqrt(T x) { + return sqrt(x); +} +template +__device__ T impl_rsqrt(T x) { + return rsqrt(x); +} +template <> +__device__ Eigen::half impl_sqrt(Eigen::half x) { + return __float2half(sqrt(__half2float(x))); +} +template <> +__device__ Eigen::half impl_rsqrt(Eigen::half x) { + return __float2half(rsqrt(__half2float(x))); +} + +template +__global__ void kv_sparse_apply_adagrad_kernel( + const Tindex* key_base, int32* item_idxs, int64 dim, Value** d_banks, + bool** d_flags, int32 var_slot_idx, int32 acc_slot_idx, int32 slot_num, + int32 bank_size, Value lr, const Value* grad, Value* var_default_v, + Value* acc_default_v, int32 var_default_v_num, int32 acc_default_v_num) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto acc_slot_offset = bank_idx * slot_num + acc_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool acc_stored = d_flags[acc_slot_offset][offset_in_bank]; + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (acc_default_v != nullptr && acc_stored == false) { + d_flags[acc_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[acc_slot_offset][offset_in_bank * dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value g = grad[item_idx * dim + id]; + Value* acc = &d_banks[acc_slot_offset][tmp_offset]; + (*acc) += g * g; + d_banks[var_slot_offset][tmp_offset] -= lr * g * rsqrtf(*acc); + } +} + +template +struct KvSparseApplyAdagrad { + void operator()(int32 num_items, Allocator* alloc, + EmbeddingVar* var, EmbeddingVar* accum, + const Tindex* key_base, const T* grad, T lr, int64 gs, + const GPUDevice& device) { + int32* item_idxs = TypedAllocator::Allocate(alloc, num_items, + AllocationAttributes()); + var->LookupOrCreateKey(key_base, item_idxs, num_items, device, gs); + auto const block_size = 256; + auto const grid_size = num_items; + GPUHashTable* hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + kv_sparse_apply_adagrad_kernel, grid_size, block_size, 0, + device.stream(), key_base, item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, var->EmbIdx(), + accum->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, lr, grad, + var->GetDefaultValuePtr(), accum->GetDefaultValuePtr(), + var->GetDefaultValueDim(), accum->GetDefaultValueDim())); + TypedAllocator::Deallocate(alloc, item_idxs, num_items); + } +}; + +template +struct KvSparseApplyAdagradHbm { + void operator()(int block_size, int embedding_dim, 
T** dev_a, T** dev_v, + const T* grad_base, T lr_scalar, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdagradGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_a, dev_v, grad_base, lr_scalar, embedding_dim, + task_size)); + } +}; + +template +__global__ void KvSparseApplyAdamKernel( + const TKey* key_base, int32* item_idxs, int64 dim, T** d_banks, + bool** d_flags, int32 var_slot_idx, int32 v_slot_idx, int32 m_slot_idx, + int32 slot_num, int32 bank_size, const T* beta1_scalar, + const T* beta2_scalar, const T* beta1_power_scalar, + const T* beta2_power_scalar, const T* epsilon_scalar, const T* lr_scalar, + const T* grad, T* var_default_v, T* v_default_v, T* m_default_v, + int32 var_default_v_num, int32 v_default_v_num, int32 m_default_v_num) { + const T lr = *lr_scalar; + const T beta1 = *beta1_scalar; + const T beta2 = *beta2_scalar; + const T beta1_power = *beta1_power_scalar; + const T beta2_power = *beta2_power_scalar; + const T epsilon = *epsilon_scalar; + + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto v_slot_offset = bank_idx * slot_num + v_slot_idx; + auto m_slot_offset = bank_idx * slot_num + m_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool v_stored = d_flags[v_slot_offset][offset_in_bank]; + bool m_stored = d_flags[m_slot_offset][offset_in_bank]; + const T alpha = lr * sqrt(static_cast(1) - beta2_power) / + (static_cast(1) - beta1_power); + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (v_default_v != nullptr && v_stored == false) { + d_flags[v_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[v_slot_offset][offset_in_bank * dim + id] = + v_default_v[(*(key_base + item_idx) % v_default_v_num) * dim + id]; + } + } + if (m_default_v != nullptr && m_stored == false) { + d_flags[m_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[m_slot_offset][offset_in_bank * dim + id] = + m_default_v[(*(key_base + item_idx) % m_default_v_num) * dim + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + m_a = m_a * beta1 + grad_a * (static_cast(1) - beta1); + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + var_a -= (m_a * alpha) / (sqrt(v_a) + epsilon); + } +} + +template +struct KvSparseApplyAdam { + Status operator()(const GPUDevice& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::ConstScalar beta1_power_scalar, + typename TTypes::ConstScalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar 
epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + const int64 inner_dim, Allocator* alloc) { + const int32 N = indices_vec.dimension(0); + if (N <= 0) return OkStatus(); + + if (inner_dim > 0) { + const int64 global_step = global_step_scalar(); + int32* item_idxs = + TypedAllocator::Allocate(alloc, N, AllocationAttributes()); + var->LookupOrCreateKey(indices_vec.data(), item_idxs, N, d, global_step); + auto const block_size = 256; + auto const grid_size = N; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + KvSparseApplyAdamKernel, grid_size, block_size, 0, + d.stream(), indices_vec.data(), item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, + var->EmbIdx(), v->EmbIdx(), m->EmbIdx(), var->SlotNum(), + hashtable->initial_bank_size, beta1_scalar.data(), + beta2_scalar.data(), beta1_power_scalar.data(), + beta2_power_scalar.data(), epsilon_scalar.data(), lr_scalar.data(), + grad.data(), var->GetDefaultValuePtr(), v->GetDefaultValuePtr(), + m->GetDefaultValuePtr(), var->GetDefaultValueDim(), + v->GetDefaultValueDim(), m->GetDefaultValueDim())); + TypedAllocator::Deallocate(alloc, item_idxs, N); + } + + return OkStatus(); + } +}; + +#define FINAL_MASK 0xffffffff + +template +__inline__ __device__ T warpReduceSum(T val) { + for (int mask = 16; mask > 0; mask >>= 1) + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + return val; +} + +template +__inline__ __device__ T blockReduceSum(T val) { + static __shared__ T shared[32]; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + + val = warpReduceSum(val); + + if (lane == 0) shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < (blockDim.x >> 5)) ? shared[lane] : (T)0.0f; + val = warpReduceSum(val); + return val; +} + +template +__global__ void kv_sparse_apply_ftrl_kernel( + const TKey* key_base, int32* item_idxs, int64 dim, Value** d_banks, + bool** d_flags, int32 var_slot_idx, int32 acc_slot_idx, + int32 linear_slot_idx, int32 slot_num, int32 bank_size, Value lr_scalar, + const Value* grad, Value* var_default_v, Value* acc_default_v, + Value* linear_default_v, int32 var_default_v_num, int32 acc_default_v_num, + int32 linear_default_v_num, Value l1_scalar, Value l2_scalar, + Value lr_power_scalar, bool has_l2_shrinkage, Value l2_shrinkage_scalar) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto acc_slot_offset = bank_idx * slot_num + acc_slot_idx; + auto linear_slot_offset = bank_idx * slot_num + linear_slot_idx; + extern __shared__ __align__(sizeof(Value)) unsigned char shared[]; + Value* new_acc = reinterpret_cast(shared); + __shared__ Value linear_sqr_sum; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool acc_stored = d_flags[acc_slot_offset][offset_in_bank]; + bool linear_stored = d_flags[linear_slot_offset][offset_in_bank]; + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (acc_default_v != nullptr && acc_stored == false) { + d_flags[acc_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[acc_slot_offset][offset_in_bank * 
dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + + id]; + } + } + if (linear_default_v != nullptr && linear_stored == false) { + d_flags[linear_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[linear_slot_offset][offset_in_bank * dim + id] = + linear_default_v[(*(key_base + item_idx) % linear_default_v_num) * + dim + + id]; + } + } + Value linear_tmp = 0; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value* var_p = &d_banks[var_slot_offset][tmp_offset]; + Value g = grad[item_idx * dim + id]; + Value gg; + if (has_l2_shrinkage) { + gg = g + 2 * l2_shrinkage_scalar * (*var_p); + } else { + gg = g; + } + Value* acc_p = &d_banks[acc_slot_offset][tmp_offset]; + new_acc[id] = *acc_p + gg * gg; + Value* linear_p = &d_banks[linear_slot_offset][tmp_offset]; + if (lr_power_scalar == -0.5) { + (*linear_p) += + gg - (sqrtf(new_acc[id]) - sqrtf(*acc_p)) / lr_scalar * (*var_p); + } else { + (*linear_p) += gg - (powf(new_acc[id], -lr_power_scalar) - + powf(*acc_p, -lr_power_scalar)) / + lr_scalar * (*var_p); + } + linear_tmp += (*linear_p) * (*linear_p); + } + linear_tmp = blockReduceSum(linear_tmp); + if (threadIdx.x == 0) { + linear_sqr_sum = linear_tmp; + } + __syncthreads(); + Value linear_norm = sqrtf(linear_sqr_sum); + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value* var_p = &d_banks[var_slot_offset][tmp_offset]; + Value* acc_p = &d_banks[acc_slot_offset][tmp_offset]; + Value* linear_p = &d_banks[linear_slot_offset][tmp_offset]; + Value g = grad[item_idx * dim + id]; + if (linear_norm > l1_scalar) { + if (lr_power_scalar == -0.5) { + auto eta_rec = sqrtf(new_acc[id]) / lr_scalar; + auto coef = (l1_scalar - linear_norm) / + ((eta_rec + 2 * l2_scalar) * linear_norm); + *var_p = coef * (*linear_p); + } else { + auto eta_rec = powf(new_acc[id], -lr_power_scalar) / lr_scalar; + auto coef = (l1_scalar - linear_norm) / + ((eta_rec + 2 * l2_scalar) * linear_norm); + *var_p = coef * (*linear_p); + } + } else { + *var_p = 0; + } + (*acc_p) += g * g; + } +} + +template +struct KvSparseApplyFtrl { + void operator()(int32 num_items, Allocator* alloc, EmbeddingVar* var, + EmbeddingVar* accum, EmbeddingVar* linear, + const TKey* key_base, const T* grad, T lr, T l1, T l2, + T lr_power, bool has_l2_shrinkage, T l2_shrinkage, + const GPUDevice& device) { + int32* item_idxs = TypedAllocator::Allocate(alloc, num_items, + AllocationAttributes()); + var->LookupOrCreateKey(key_base, item_idxs, num_items, device); + auto const block_size = 256; + auto const grid_size = num_items; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + kv_sparse_apply_ftrl_kernel, grid_size, block_size, + (var->ValueLen()) * sizeof(T), device.stream(), key_base, item_idxs, + var->ValueLen(), hashtable->d_bank_ptrs, + hashtable->d_existence_flag_ptrs, var->EmbIdx(), accum->EmbIdx(), + linear->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, lr, + grad, var->GetDefaultValuePtr(), accum->GetDefaultValuePtr(), + linear->GetDefaultValuePtr(), var->GetDefaultValueDim(), + accum->GetDefaultValueDim(), linear->GetDefaultValueDim(), l1, l2, + lr_power, has_l2_shrinkage, l2_shrinkage)); + TypedAllocator::Deallocate(alloc, item_idxs, num_items); + } +}; + +template +__global__ void KvSparseApplyAdamAsyncKernel( + const TKey* key_base, int32* item_idxs, int64 dim, T** d_banks, + bool** d_flags, int32 var_slot_idx, 
int32 v_slot_idx, int32 m_slot_idx, + int32 slot_num, int32 bank_size, const T* beta1_scalar, + const T* beta2_scalar, const T* beta1_power_scalar, + const T* beta2_power_scalar, const T* epsilon_scalar, const T* lr_scalar, + const T* grad, T* var_default_v, T* v_default_v, T* m_default_v, + int32 var_default_v_num, int32 v_default_v_num, int32 m_default_v_num, + bool apply_sparse_rmsprop) { + const T lr = *lr_scalar; + const T beta1 = *beta1_scalar; + const T beta2 = *beta2_scalar; + const T beta1_power = *beta1_power_scalar; + const T beta2_power = *beta2_power_scalar; + const T epsilon = *epsilon_scalar; + + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto v_slot_offset = bank_idx * slot_num + v_slot_idx; + auto m_slot_offset = bank_idx * slot_num + m_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool v_stored = d_flags[v_slot_offset][offset_in_bank]; + bool m_stored = d_flags[m_slot_offset][offset_in_bank]; + const T alpha = lr * sqrt(static_cast(1) - beta2_power) / + (static_cast(1) - beta1_power); + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (v_default_v != nullptr && v_stored == false) { + d_flags[v_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[v_slot_offset][offset_in_bank * dim + id] = + v_default_v[(*(key_base + item_idx) % v_default_v_num) * dim + id]; + } + } + if (m_default_v != nullptr && m_stored == false) { + d_flags[m_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[m_slot_offset][offset_in_bank * dim + id] = + m_default_v[(*(key_base + item_idx) % m_default_v_num) * dim + id]; + } + } + + if (apply_sparse_rmsprop) { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + m_a = m_a * beta1 + rsqrt(v_a + epsilon) * lr * grad_a; + var_a -= m_a; + } + } else { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + m_a = m_a * beta1 + grad_a * (static_cast(1) - beta1); + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + var_a -= (m_a * alpha) / (sqrt(v_a) + epsilon); + } + } +} + +template +struct KvSparseApplyAdamAsync { + Status operator()(const GPUDevice& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename 
TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + bool apply_sparse_rmsprop, const int64 inner_dim, + Allocator* alloc) { + const int32 N = indices_vec.dimension(0); + if (N <= 0) return OkStatus(); + + if (inner_dim > 0) { + const int64 global_step = global_step_scalar(); + int32* item_idxs = + TypedAllocator::Allocate(alloc, N, AllocationAttributes()); + var->LookupOrCreateKey(indices_vec.data(), item_idxs, N, d, global_step); + auto const block_size = 256; + auto const grid_size = N; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + KvSparseApplyAdamAsyncKernel, grid_size, block_size, 0, + d.stream(), indices_vec.data(), item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, + var->EmbIdx(), v->EmbIdx(), m->EmbIdx(), var->SlotNum(), + hashtable->initial_bank_size, beta1_scalar.data(), + beta2_scalar.data(), beta1_power_scalar.data(), + beta2_power_scalar.data(), epsilon_scalar.data(), lr_scalar.data(), + grad.data(), var->GetDefaultValuePtr(), v->GetDefaultValuePtr(), + m->GetDefaultValuePtr(), var->GetDefaultValueDim(), + v->GetDefaultValueDim(), m->GetDefaultValueDim(), + apply_sparse_rmsprop)); + TypedAllocator::Deallocate(alloc, item_idxs, N); + } + + if (!apply_sparse_rmsprop) { + beta1_power_scalar.device(d) = beta1_power_scalar * beta1_scalar; + beta2_power_scalar.device(d) = beta2_power_scalar * beta2_scalar; + } + + return OkStatus(); + } +}; + +template +struct KvSparseApplyAdamAsyncHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T* beta1_power_ptr, T* beta2_power_ptr, + int64 task_size, const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamAsyncGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, beta1_power_ptr, beta2_power_ptr, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamAsyncSparseRmspropHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, int64 task_size, const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamAsyncSparseRmspropGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T beta1_power, T beta2_power, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, beta1_power, beta2_power, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamWHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T weight_decay, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamWGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + 
epsilon, weight_decay, embedding_dim, task_size)); + } +}; + +} // namespace functor + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdagrad; \ + template struct functor::KvSparseApplyAdagrad; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyFtrl; \ + template struct functor::KvSparseApplyFtrl; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsyncSparseRmspropHbm< \ + GPUDevice, int32, type>; \ + template struct functor::KvSparseApplyAdamAsyncSparseRmspropHbm< \ + GPUDevice, int64, type>; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsyncHbm; \ + template struct functor::KvSparseApplyAdamAsyncHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamHbm; \ + template struct functor::KvSparseApplyAdamHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdagradHbm; \ + template struct functor::KvSparseApplyAdagradHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamWHbm; \ + template struct functor::KvSparseApplyAdamWHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +} // end namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h new file mode 100644 index 00000000..b31a3691 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h @@ -0,0 +1,119 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ + +#if GOOGLE_CUDA +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template +struct KvSparseApplyAdagrad { + void operator()(int32 num_items, Allocator* alloc, + EmbeddingVar* var, EmbeddingVar* accum, + const Tindex* key_base, const T* grad, T lr, int64 gs, + const Device& device); +}; + +template +struct KvSparseApplyFtrl { + void operator()(int32 num_items, Allocator* alloc, EmbeddingVar* var, + EmbeddingVar* accum, EmbeddingVar* linear, + const TKey* key_base, const T* grad, T lr, T l1, T l2, + T lr_power, bool has_l2_shrinkage, T l2_shrinkage, + const Device& device); +}; + +template +struct KvSparseApplyAdam { + Status operator()(const Device& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::ConstScalar beta1_power_scalar, + typename TTypes::ConstScalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + const int64 inner_dim, Allocator* alloc); +}; + +template +struct KvSparseApplyAdamAsync { + Status operator()(const Device& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + bool apply_sparse_rmsprop, const int64 inner_dim, + Allocator* alloc); +}; + +template +struct KvSparseApplyAdamAsyncSparseRmspropHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, int64 task_size, const Device& device); +}; + +template +struct KvSparseApplyAdamHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T beta1_power, T beta2_power, int64 task_size, + const Device& device); +}; + +template +struct KvSparseApplyAdagradHbm { + void operator()(int block_size, int embedding_dim, T** dev_a, T** dev_v, + const T* grad_base, T lr_scalar, int64 task_size, + const Device& device); +}; + +template +struct KvSparseApplyAdamAsyncHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T* beta1_power_ptr, T* beta2_power_ptr, + int64 task_size, const Device& device); +}; + +template +struct KvSparseApplyAdamWHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T weight_decay, int64 task_size, + const Device& device); +}; +} // end namespace functor +} // end namespace tensorflow +#endif // GOOGLE_CUDA + +#endif 
// TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc new file mode 100644 index 00000000..741477be --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc @@ -0,0 +1,485 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +// Note, this op works on cpu only. 
+template +class KvSparseApplyFtrlOp : public OpKernel { + public: + explicit KvSparseApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + + EmbeddingVar* var_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var_)); + core::ScopedUnref unref_var(var_); + EmbeddingVar* accum_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum_)); + core::ScopedUnref unref_accum(accum_); + EmbeddingVar* linear_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &linear_)); + core::ScopedUnref unref_linear(linear_); + + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& lr = ctx->input(5); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr.shape()) && + lr.scalar()() > static_cast(0), + errors::InvalidArgument("lr is not a positive scalar: ", + lr.shape().DebugString())); + + const Tensor& l1 = ctx->input(6); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l1.shape()) && + l1.scalar()() >= static_cast(0), + errors::InvalidArgument("l1 regularization strength is not a " + "non-negative scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(7); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l2.shape()) && + l2.scalar()() >= static_cast(0), + errors::InvalidArgument("l2 regularization strength is not a " + "non-negative scalar: ", + l2.shape().DebugString())); + const int lr_power_index = has_l2_shrinkage ? 9 : 8; + const Tensor& lr_power = ctx->input(lr_power_index); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr_power.shape()) && + lr_power.scalar()() <= static_cast(0), + errors::InvalidArgument("lr_power is not a " + "non-positive scalar: ", + lr_power.shape().DebugString())); + int64 inner_dim = 1; + TensorShape var_shape({var_->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const Tensor* l2_shrinkage; + if (has_l2_shrinkage) { + l2_shrinkage = &ctx->input(8); + OP_REQUIRES( + ctx, + TensorShapeUtils::IsScalar(l2_shrinkage->shape()) && + l2_shrinkage->scalar()() >= static_cast(0), + errors::InvalidArgument("l2 shrinkage regularization strength " + "is not a non-negative scalar: ", + l2_shrinkage->shape().DebugString())); + } + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const int counts_input_index = has_l2_shrinkage ? 
10 : 9; + const Tensor& counts_tensor = ctx->input(counts_input_index); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + T l2_shrinkage_scalar = 0.0; + if (has_l2_shrinkage) { + l2_shrinkage_scalar = l2_shrinkage->scalar()(); + } + T lr_power_scalar = lr_power.scalar()(); + auto do_work = [this, ctx, inner_dim, &var_, &indices_vec, &accum_, + &linear_, &grad_flat, &lr_scalar, &l1_scalar, + &l2_scalar, &lr_power, &l2_shrinkage_scalar, + &lr_power_scalar, get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const TKey index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK( + ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + if (is_filter) { + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); + auto grad = grad_flat.template chip<0>(i); + +// Use a macro to implement the computation here due to the templating of the +// eigen tensor library. +#define COMPUTE_FTRL(grad_to_use) \ + auto new_accum = accum + grad_to_use.square(); \ + if (lr_power_scalar == static_cast(-0.5)) { \ + linear += \ + grad_to_use - (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var; \ + } else { \ + linear += grad_to_use - (new_accum.pow(-lr_power_scalar) - \ + accum.pow(-lr_power_scalar)) / \ + lr_scalar * var; \ + } \ + Eigen::Tensor linear_sqrsum = \ + linear.square().sum().sqrt(); \ + T linear_norm = linear_sqrsum(0); \ + if (linear_norm > l1_scalar) { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + auto eta_rec = new_accum.sqrt() / lr_scalar; \ + auto coef = (l1_scalar - linear_norm) / \ + ((eta_rec + static_cast(2) * l2_scalar) * linear_norm); \ + var = coef * linear; \ + } else { \ + auto eta_rec = new_accum.pow(-lr_power_scalar) / lr_scalar; \ + auto coef = (l1_scalar - linear_norm) / \ + ((eta_rec + static_cast(2) * l2_scalar) * linear_norm); \ + var = coef * linear; \ + } \ + } else { \ + var = var.constant(static_cast(0)); \ + } \ + accum += grad.square(); + if (has_l2_shrinkage) { + auto grad_with_shrinkage = + grad + static_cast(2) * l2_shrinkage_scalar * var; + COMPUTE_FTRL(grad_with_shrinkage); + } else { + COMPUTE_FTRL(grad); + } + } + } +#undef COMPUTE_FTRL + }; + + const int64 cost = 4500; // very unreliable estimate for cost per step. + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + if (has_counts && !indices_as_pointer) { + const int counts_input_index = has_l2_shrinkage ? 
10 : 9; + const Tensor& indices_counts = ctx->input(counts_input_index); + var_->UpdateCache(indices, indices_counts); + } + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyFtrlOpGPU : public OpKernel { + public: + explicit KvSparseApplyFtrlOpGPU(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + EmbeddingVar* var_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var_)); + EmbeddingVar* accum_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum_)); + EmbeddingVar* linear_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &linear_)); + + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& lr = ctx->input(5); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr.shape()) && + lr.scalar()() > static_cast(0), + errors::InvalidArgument("lr is not a positive scalar: ", + lr.shape().DebugString())); + + const Tensor& l1 = ctx->input(6); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l1.shape()) && + l1.scalar()() >= static_cast(0), + errors::InvalidArgument("l1 regularization strength is not a " + "non-negative scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(7); + OP_REQUIRES(ctx, + 
TensorShapeUtils::IsScalar(l2.shape()) && + l2.scalar()() >= static_cast(0), + errors::InvalidArgument("l2 regularization strength is not a " + "non-negative scalar: ", + l2.shape().DebugString())); + const int lr_power_index = has_l2_shrinkage ? 9 : 8; + const Tensor& lr_power = ctx->input(lr_power_index); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr_power.shape()) && + lr_power.scalar()() <= static_cast(0), + errors::InvalidArgument("lr_power is not a " + "non-positive scalar: ", + lr_power.shape().DebugString())); + int64 inner_dim = 1; + TensorShape var_shape({var_->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const Tensor* l2_shrinkage; + if (has_l2_shrinkage) { + l2_shrinkage = &ctx->input(8); + OP_REQUIRES( + ctx, + TensorShapeUtils::IsScalar(l2_shrinkage->shape()) && + l2_shrinkage->scalar()() >= static_cast(0), + errors::InvalidArgument("l2 shrinkage regularization strength " + "is not a non-negative scalar: ", + l2_shrinkage->shape().DebugString())); + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_flat = indices.flat(); + auto grad_flat = grad.flat(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + T l2_shrinkage_scalar = 0.0; + if (has_l2_shrinkage) { + l2_shrinkage_scalar = l2_shrinkage->scalar()(); + } + T lr_power_scalar = lr_power.scalar()(); + const TKey* key_base = &indices_flat(0); + const T* grad_base = &grad_flat(0); + const Device& device = ctx->eigen_device(); + + functor::KvSparseApplyFtrl()( + N, ctx->get_allocator(AllocatorAttributes()), var_, accum_, linear_, + key_base, grad_base, lr_scalar, l1_scalar, l2_scalar, + lr_power_scalar, has_l2_shrinkage, l2_shrinkage_scalar, device); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +namespace functor { +#define DECLARE_GPU_SPEC(TKey, T) \ + template <> \ + void KvSparseApplyFtrl::operator()( \ + int32 num_items, Allocator* alloc, EmbeddingVar* var, \ + EmbeddingVar* accum, EmbeddingVar* linear, \ + const TKey* key_base, const T* grad, T lr, T l1, T l2, T lr_power, \ + bool has_l2_shrinkage, T l2_shrinkage, const GPUDevice& device); \ + extern template struct KvSparseApplyFtrl; +DECLARE_GPU_SPEC(int32, float); +DECLARE_GPU_SPEC(int32, double); +DECLARE_GPU_SPEC(int64, float); +DECLARE_GPU_SPEC(int64, double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrl") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("l1") \ + .HostMemory("l2") \ + .HostMemory("lr_power") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOpGPU); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + 
Name("KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("l1") \ + .HostMemory("l2") \ + .HostMemory("lr_power") \ + .HostMemory("l2_shrinkage") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOpGPU); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc new file mode 100644 index 00000000..16dd8e6d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc @@ -0,0 +1,200 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvResourceSparseApplyGradientDescentOp : public OpKernel { + public: + explicit KvResourceSparseApplyGradientDescentOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0}); + + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + const Tensor& lr = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + + const Tensor& grad = ctx->input(2); + const Tensor& indices = ctx->input(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& global_step = ctx->input(4); + 
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(5); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + auto indices_vec = indices.vec(); + T lr_scalar = lr.scalar()(); + Tstep gs = global_step.scalar()(); + + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto do_work = [this, ctx, &indices_vec, var, &grad_flat, &gs, + &lr_scalar, indices_counts, + get_count_fn](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto g = grad_flat.template chip<0>(i); + auto v = var->flat(value_ptr); + v -= g.constant(lr_scalar) * g; + } + } + }; + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + if (has_counts && !indices_as_pointer) { + const Tensor& indices = ctx->input(5); + var->UpdateCache(indices, indices_counts); + } else { + var->UpdateCache(indices); + } + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyGradientDescent") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyGradientDescent") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyGradientDescentWithCounts") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyGradientDescentWithCounts") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, 
int64, int32); \ + REGISTER_KERNELS(T, int64, int64); \ + REGISTER_KERNELS(T, int32, int32); \ + REGISTER_KERNELS(T, int32, int64); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc b/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc new file mode 100644 index 00000000..c3a8e129 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("HotnessCalculate") + .Input("row_length_buffer: Tindices") + .Output("hotness: int32") + .Attr("num_gpus: int") + .Attr("num_lookups: int") + .Attr("Tindices: {int32, int64} = DT_INT64") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unknown_1d_shape = c->UnknownShapeOfRank(1); + + c->set_output(0, unknown_1d_shape); + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc new file mode 100644 index 00000000..09f237ed --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc @@ -0,0 +1,282 @@ +// Copyright 2016 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeAndType; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace tensorflow { + +REGISTER_OP("GroupEmbeddingVarLookupDense") + .Input("resource: num_lookups * resource") + .Input("dense_values: num_lookups * Tkeys") + .Input("default_value: dtype") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("dimension: int") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("is_inference: bool = false") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") // placeholder + .Attr("ignore_weights: bool = true") // placeholder + .Attr("is_sequence: bool = false") + .SetShapeFn([](InferenceContext* c) { + int num_lookups; + TF_RETURN_IF_ERROR(c->GetAttr("num_lookups", &num_lookups)); + const std::vector* shapes_and_types = + nullptr; + for (int i = 0; i < num_lookups; ++i) { + shapes_and_types = c->input_handle_shapes_and_types(i); + // LOG(INFO) << "shapes_and_types: shape=" + // << c->DebugString(shapes_and_types->at(0).shape); + + ShapeHandle temp; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(c->input(num_lookups + i), 1, &temp)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(shapes_and_types->at(0).shape, 1, &unused)); + ShapeHandle params_subshape; + params_subshape = shapes_and_types->at(0).shape; + + ShapeHandle indices_shape = c->input(num_lookups + i); + ShapeHandle out; + TF_RETURN_IF_ERROR( + c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(i, out); + c->set_output(num_lookups + i, + c->Vector(InferenceContext::kUnknownDim)); + // c->set_output(num_lookups * 2 + i, c->input(num_lookups+i)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupEmbeddingVarLookup") + .Input("resource: num_lookups * resource") + .Input("sp_values: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("sp_weights: num_lookups * dtype") + .Input("dense_shape: num_lookups * int64") + .Input("default_value: dtype") + .Attr("ignore_weights: bool = false") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_sequence: bool = false") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("dimension: int") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Output("batch_nums: num_lookups * int32") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("is_inference: bool = false") + .SetShapeFn([](InferenceContext* c) { + int num_lookups; + TF_RETURN_IF_ERROR(c->GetAttr("num_lookups", &num_lookups)); + + for (int i = 0; i < num_lookups; ++i) { + auto shapes_and_types = c->input_handle_shapes_and_types(i); + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(shapes_and_types->at(0).shape, 1, &unused)); + TF_RETURN_IF_ERROR( + c->WithRank(c->input(num_lookups 
* 2 + i), 2, &unused)); + // TF_RETURN_IF_ERROR(c->WithRank(c->input(num_lookups*3+i), 1, + // &unused)); + ShapeHandle params_subshape; + params_subshape = shapes_and_types->at(0).shape; + + ShapeHandle indices_shape = c->input(num_lookups + i); + ShapeHandle out; + TF_RETURN_IF_ERROR( + c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(i, out); + c->set_output(num_lookups + i, + c->Vector(InferenceContext::kUnknownDim)); + c->set_output(num_lookups * 2 + i, c->input(num_lookups + i)); + c->set_output(num_lookups * 3 + i, + c->Vector(InferenceContext::kUnknownDim)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupEmbeddingVariableLookupGrad") + .Input("grads: num_lookups * dtype") + .Input("embedding_resources: num_lookups * resource") + .Input("unique_keys: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("batch_nums: num_lookups * int32") + .Output("nnz_grads: num_lookups * dtype") + .Attr("dimension: int") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("num_lookups: int >=1") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups = ctx->num_outputs(); + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(i, + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + } + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookup") + .Input("emb_variables: num_lookups * dtype") + .Input("sp_values: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("sp_weights: num_lookups * dtype") + .Input("dense_shape: num_lookups * int64") + .Input("default_value: dtype") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Output("batch_nums: num_lookups * int32") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("dimension: int") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("ignore_weights: bool = false") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_sequence: bool = false") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_lookups", &num_lookups)); + + bool is_sequence; + TF_RETURN_IF_ERROR(ctx->GetAttr("is_sequence", &is_sequence)); + + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle temp; + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_lookups + i), 1, &temp)); + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_lookups + i), 2, &temp)); + // TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3*num_lookups+i), 1, + // &temp)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(ctx->WithRankAtLeast(ctx->input(i), 1, &unused)); + ShapeHandle params_subshape; + TF_RETURN_IF_ERROR(ctx->Subshape(ctx->input(i), 1, ¶ms_subshape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(params_subshape, 0); + DimensionHandle batch_dim = ctx->UnknownDim(); + if (is_sequence) { + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, batch_dim, emb_vec_size_dim}); + ctx->set_output(i, output_shape); + } else { + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(i, output_shape); + } + ctx->set_output(num_lookups + i, + ctx->Vector(InferenceContext::kUnknownDim)); + 
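        // Per lookup the outputs are: output and unique_keys (set above),
+        // unique_idx (same shape as sp_values) and batch_nums
+        // (unknown-length vector), set below. +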
ctx->set_output(num_lookups * 2 + i, ctx->input(num_lookups + i)); + ctx->set_output(num_lookups * 3 + i, + ctx->Vector(InferenceContext::kUnknownDim)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookupGrad") + .Input("grads: num_lookups * float32") + .Input("embedding_variables: num_lookups * dtype") + .Input("unique_keys: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("batch_nums: num_lookups * int32") + .Output("nnz_grads: num_lookups * float32") + .Attr("dimension: int") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("num_lookups: int >=1") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups = ctx->num_outputs(); + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR( + ctx->WithRankAtLeast(ctx->input(i), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(i, + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + } + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookupDense") + .Input("emb_variables: num_lookups * dtype") + .Input("dense_values: num_lookups * Tkeys") + .Input("default_value: dtype") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Attr("dimension: int") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") // placeholder + .Attr("ignore_weights: bool = true") // placeholder + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_lookups", &num_lookups)); + + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle temp; + TF_RETURN_IF_ERROR( + ctx->WithRankAtLeast(ctx->input(num_lookups + i), 1, &temp)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(ctx->WithRankAtLeast(ctx->input(i), 1, &unused)); + ShapeHandle params_subshape; + TF_RETURN_IF_ERROR(ctx->Subshape(ctx->input(i), 1, ¶ms_subshape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(params_subshape, 0); + DimensionHandle batch_dim = ctx->UnknownDim(); + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ShapeHandle offset_shape = ctx->MakeShape({batch_dim, 1}); + ctx->set_output(i, output_shape); + ctx->set_output(num_lookups + i, + ctx->Vector(InferenceContext::kUnknownDim)); + // ctx->set_output(num_lookups * 2 + i, ctx->input(num_lookups+i)); + } + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc new file mode 100644 index 00000000..771cdf9f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc @@ -0,0 +1,73 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("KvResourceIncrImport") + .Input("prefix: string") + .Input("resource_handle: resource") + .Input("tensor_names: string") + .Input("empty_key: Tkeys") + .Input("value: dtype") + .Attr("Tkeys: {int64, int32}") + 
.Attr("dtype: type") + .Attr("partition_id: int = 0") + .Attr("partition_num: int = 1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +REGISTER_OP("IncrSave") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("is_sparse: bool") + .Input("tensors: dtypes") + .Attr("dtypes: list(type)") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("IncrRestore") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("is_sparse: bool") + .Input("in_tensors: dtypes") + .Output("out_tensors: dtypes") + .Attr("dtypes: list(type)") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("RecordSparseIndices") + .Input("keys: TIndex") + .Attr("var_name: string = ''") + .Attr("TIndex: {int32, int64}") + .Attr("auto_record: bool = false") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("ActivateSparseRecorder") + .Input("tensor_names: string") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("CollectSparseIndices") + .Output("indices: ktype") + .Output("global_indices: ktype") + .Attr("tensor_name: string") + .Attr("config: string = ''") + .Attr("part_idx: int = -1") + .Attr("part_count: int = 0") + .Attr("hash_bucket_size: int = 0") + .Attr("part_mode: string = ''") + .Attr("ktype: {int32, int64}") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc new file mode 100644 index 00000000..7c354106 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc @@ -0,0 +1,436 @@ +// Copyright 2016 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeAndType; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace tensorflow { + +namespace { + +Status ReadVariableShapeFn(InferenceContext* c) { + std::vector shape_and_type; + TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &shape_and_type)); + c->set_output(0, shape_and_type[0].shape); + return OkStatus(); +} + +Status CreateAssignShapeFn(InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR(shape_inference::ValidateVariableResourceHandle( + c, &handle_shape_and_type)); + + ShapeHandle value_shape = c->input(1); + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->Merge(handle_shape_and_type[0].shape, value_shape, &unused)); + return OkStatus(); +} + +} // namespace + +// KvVar +REGISTER_OP("KvVarHandleOp") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("Tkeys: {int64, int32} = DT_INT64") + .Output("resource: resource") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + DataType t; + TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t)); + PartialTensorShape p; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &p)); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s)); + c->set_output_handle_shapes_and_types(0, + std::vector{{s, t}}); + + return absl::OkStatus(); + }) + .Doc(R"( +Creates a handle to a Variable resource. + +container: the container this variable is placed in. +shared_name: the name by which this variable is referred to. +dtype: the type of this variable. Must agree with the dtypes + of all ops using this variable. +shape: The (possibly partially specified) shape of this variable. +)"); + +REGISTER_OP("ReadKvVariableOp") + .Input("resource: resource") + .Output("value: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .SetShapeFn(ReadVariableShapeFn) + .Doc(R"( +Reads the value of a variable. + +The tensor returned by this operation is immutable. + +The value returned by this operation is guaranteed to be influenced by all the +writes on which this operation depends directly or indirectly, and to not be +influenced by any of the writes which depend directly or indirectly on this +operation. + +resource: handle to the resource in which to store the variable. +dtype: the dtype of the value. 
+)"); + +REGISTER_OP("InitializeKvVariableOp") + .Input("resource_self: resource") + .Input("resource_primary: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Attr("slot_num: int = 0") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("initial_num_buckets: int = 131072") // 2^17 + .Attr("max_load_factor: float = 0.8") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("emb_index: int = 0") + .Attr("block_num: int = 1") + .Attr("slot_index: int = 0") + .Attr("ht_partition_num: int = 1000") + .Attr("filter_freq: int = 0") + .Attr("max_freq: int = 999999") + .Attr("max_element_size: int = 0") + .Attr("counter_type: type") + .Attr("false_positive_probability: float = -1.0") + .Attr("l2_weight_threshold: float =-1.0") + .Attr("layout: string = ''") + .Attr("storage_type: int = 0") + .Attr("storage_path: string = '.'") + .Attr("storage_size: list(int) = []") + .Attr("default_value_dim: int = 4096") + .Attr("default_value_no_permission: float = .0") + .Attr("record_freq: bool = false") + .Attr("record_version: bool = false") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"( +Assigns a new value to a variable. + +Any ReadVariableOp with a control dependency on this op is guaranteed to return +this value or a subsequent newer value of the variable. + +resource_self: handle to the resource in which to store the variable. +resource_primary: handle to the resource in which to store the variable. +value: the value to set the new tensor to use. +dtype: the dtype of the value. +)"); + +REGISTER_OP("InitializeKvVariableV2Op") + .Input("resource_self: resource") + .Input("resource_primary: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Attr("slot_num: int = 0") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("initial_num_buckets: int = 131072") // 2^17 + .Attr("max_load_factor: float = 0.8") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("emb_index: int = 0") + .Attr("block_num: int = 1") + .Attr("slot_index: int = 0") + .Attr("ht_partition_num: int = 1000") + .Attr("filter_freq: int = 0") + .Attr("max_freq: int = 999999") + .Attr("max_element_size: int = 0") + .Attr("counter_type: type") + .Attr("false_positive_probability: float = -1.0") + .Attr("l2_weight_threshold: float =-1.0") + .Attr("layout: string = ''") + .Attr("storage_type: int = 0") + .Attr("storage_path: string = '.'") + .Attr("storage_size: list(int) = []") + .Attr("default_value_dim: int = 4096") + .Attr("default_value_no_permission: float = .0") + .Attr("record_freq: bool = false") + .Attr("record_version: bool = false") + .Attr("embedding_variable_type: int = 0") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"( +Assigns a new value to a variable. + +Any ReadVariableOp with a control dependency on this op is guaranteed to return +this value or a subsequent newer value of the variable. + +resource_self: handle to the resource in which to store the variable. +resource_primary: handle to the resource in which to store the variable. +value: the value to set the new tensor to use. +dtype: the dtype of the value. +)"); + +REGISTER_OP("KvVarIsInitializedOp") + .Input("resource: resource") + .Output("is_initialized: bool") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc(R"doc( +Checks whether a resource handle-based variable has been initialized. 
+ +resource: the input resource handle. +is_initialized: a scalar boolean which is true if the variable has been +initialized. +)doc"); + +REGISTER_OP("KvVarIsAllSlotInitializedOp") + .Input("resource: resource") + .Output("is_all_slot_initialized: bool") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc(R"doc( +Checks whether a resource handle-based variable has been initialized. + +resource: the input resource handle. +is_all_slot_initialized: a scalar boolean which is true if the variable has been +initialized. +)doc"); + +REGISTER_OP("KvResourceInitCacheStrategyOp") + .Input("resource: resource") + .Attr("cache_strategy: int = 1") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: {float32, double}") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +Status KvVariableShapeShapeFn(InferenceContext* c) { + auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data == nullptr || handle_data->empty()) { + return errors::InvalidArgument("Handle doesn't have shape information."); + } + c->set_output(0, (*handle_data)[0].shape); + return OkStatus(); +} + +REGISTER_OP("KvVariableShape") + .Input("input: resource") + .Output("output: out_type") + .Attr("out_type: {int32, int64} = DT_INT32") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + // .SetShapeFn(KvVariableShapeShapeFn) + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(2)); + return OkStatus(); + }) + .Doc(R"doc( +Returns the shape of the variable pointed to by `resource`. + +This operation returns a 1-D integer tensor representing the shape of `input`. + +For example: + +``` +# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +shape(t) ==> [2, 2, 3] +``` + +)doc"); + +REGISTER_OP("DestroyKvResourceOp") + .Input("resource: resource") + .Attr("ignore_lookup_error: bool = true") + .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Deletes the resource specified by the handle. + +All subsequent operations using the resource will result in a NotFound +error status. + +resource: handle to the resource to delete. +ignore_lookup_error: whether to ignore the error when the resource + doesn't exist. +)"); + +REGISTER_OP("_OPT_KvResourceLookupID") + .Input("resource: resource") + .Input("indices: Tkeys") + .Output("pointer: int64") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR( + ValidateVariableResourceHandle(c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + + ShapeHandle indices_shape = c->input(1); + c->set_output(0, indices_shape); + return OkStatus(); + }) + .Doc(R"doc( +Lookup the `pointer` from the variable pointed to by `resource` according to `indices`. 
+)doc"); + +REGISTER_OP("KvResourceGatherV1") + .Input("resource: resource") + .Input("indices: Tkeys") + .Input("default_value: dtype") + .Input("counts: counts_type") + .Attr("validate_indices: bool = true") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_inference: bool = false") + .Output("output: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("counts_type: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR( + ValidateVariableResourceHandle(c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + ShapeHandle params_subshape; + params_subshape = handle_shape_and_type[0].shape; + // TF_RETURN_IF_ERROR( + // c->Subshape(handle_shape_and_type.shape, 1, ¶ms_subshape)); + ShapeHandle indices_shape = c->input(1); + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(0, out); + return OkStatus(); + }) + .Doc(R"doc( +Gather slices from the variable pointed to by `resource` according to `indices`. + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `indices.shape + params.shape[1:]` where: + +```python + # Scalar indices + output[:, ..., :] = params[indices, :, ... :] + + # Vector indices + output[i, :, ..., :] = params[indices[i], :, ... :] + + # Higher rank indices + output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] +``` + +)doc"); + +REGISTER_OP("KvResourceGather") + .Input("resource: resource") + .Input("indices: Tkeys") + .Input("default_value: dtype") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("validate_indices: bool = true") + .Output("output: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("is_inference: bool = false") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR(shape_inference::ValidateVariableResourceHandle( + c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + + ShapeHandle params_subshape; + params_subshape = handle_shape_and_type[0].shape; + // TF_RETURN_IF_ERROR( + // c->Subshape(handle_shape_and_type.shape, 1, ¶ms_subshape)); + ShapeHandle indices_shape = c->input(1); + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(0, out); + return OkStatus(); + }) + .Doc(R"doc( +Gather slices from the variable pointed to by `resource` according to `indices`. + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `indices.shape + params.shape[1:]` where: + +```python + # Scalar indices + output[:, ..., :] = params[indices, :, ... :] + + # Vector indices + output[i, :, ..., :] = params[indices[i], :, ... :] + + # Higher rank indices + output[i, ..., j, :, ... 
:] = params[indices[i, ..., j], :, ..., :] +``` + +)doc"); + +REGISTER_OP("EVGetFrequency") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("EVGetVersion") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceLookupTier") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int32") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceLookupResource") + .Input("resource_handle: resource") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .Output("output: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc new file mode 100644 index 00000000..b49d868b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc @@ -0,0 +1,122 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("SaveV3") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("ev_names: string") + .Input("ev_resources: int64") + .Input("tensors: dtypes") + .Attr("dtypes: list(type)") + .Attr("ev_key_types: list(type) = []") + .Attr("has_ev: bool = false") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + ShapeHandle s; + DimensionHandle unused_dim; + + // Validate prefix. + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + + // Validate tensor_names and shapes_and_slices. + for (int i = 1; i <= 2; ++i) { + TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &s)); + TF_RETURN_IF_ERROR( + c->WithValue(c->Dim(s, 0), c->num_inputs() - 5, &unused_dim)); + } + + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &s)); + return OkStatus(); + }); + +REGISTER_OP("KvResourceImport") + .Input("resource_handle: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Input("keys: Tkeys") + .Input("values: dtype") + .Input("versions: int64") + .Attr("shape: shape") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("ht_partition_num: int = 1000") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO(dingchen): Validate keys and values shape. + return OkStatus(); + }) + .Doc(R"doc( +Replaces the contents of the table with the specified keys and values. + +The tensor `keys` must be of the same type as the keys of the table. +The tensor `values` must be of the type of the table values. 
+ +resource_handle: Handle to the table. +keys: Any shape. Keys to look up. +values: Values to associate with keys. +)doc"); + +REGISTER_OP("KvResourceImportV3") + .Input("prefix: string") + .Input("resource_self: resource") + .Input("tensor_names: string") + .Input("empty_key: Tkeys") + .Attr("shape: shape") + .Attr("partition_id: int = 0") + .Attr("partition_num: int = 1") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("reset_version: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceExport") + .Input("resource_handle: resource") + .Output("keys: Tkeys") + .Output("values: Tvalues") + .Output("versions: int64") + .Output("freqs: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle values = c->UnknownShape(); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 2, &values)); + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeHandle versions = c->UnknownShapeOfRank(1); + ShapeHandle freqs = c->UnknownShapeOfRank(1); + c->set_output(0, keys); + c->set_output(1, values); + c->set_output(2, versions); + c->set_output(3, freqs); + return OkStatus(); + }) + .Doc(R"doc( +Outputs all keys and values in the kv resource. + +resource_handle: Handle to the kvResource. +keys: Vector of all keys present in the table. +values: Tensor of all values in the table. Indexed in parallel with `keys`. +versions: Vector of all versions present in the table. +freqs: Vector of all freqs present in the table. +)doc"); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc new file mode 100644 index 00000000..d61ea68b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc @@ -0,0 +1,109 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. 
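+  // That is, indices must be rank-1, its length must equal grad's leading
+  // dimension, and the trailing dimensions of grad must be mergeable with
+  // the variable's value shape.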
+ ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("lr: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdagradShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagrad"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagrad"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("lr: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdagradShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagradWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagradWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc new file mode 100644 index 00000000..a19cfeb8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc @@ -0,0 +1,129 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: resource") \ + .Input("beta2_power: resource") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("apply_sparse_rmsprop: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdamAsyncShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsync"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsync"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: resource") \ + .Input("beta2_power: resource") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + 
.Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("apply_sparse_rmsprop: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdamAsyncShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsyncWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsyncWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc new file mode 100644 index 00000000..64be1148 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc @@ -0,0 +1,127 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. 
+ ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyAdamShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: T") \ + .Input("beta2_power: T") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdamShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdam"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdam"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: T") \ + .Input("beta2_power: T") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdamShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc new file mode 100644 index 00000000..319a6a8f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc @@ -0,0 +1,96 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyFtrlShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 
5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("linear: resource") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("lr: T") \ + .Input("l1: T") \ + .Input("l2: T") \ + .Input("lr_power: T") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64, string}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyFtrlShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrl"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrl"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc new file mode 100644 index 00000000..6ec435f5 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc @@ -0,0 +1,80 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status KvApplyGradientDescentShapeFn(InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + ShapeHandle grad = ShapeOrHandleShape(c, 2); + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &indices)); + DimensionHandle unused2; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused2)); + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("alpha: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn(KvApplyGradientDescentShapeFn) +REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescent"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescent"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("alpha: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn(KvApplyGradientDescentShapeFn) +REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescentWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescentWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/BUILD b/deepray/custom_ops/embedding_variable/cc/tests/BUILD new file mode 100644 index 00000000..6e0d0f8e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/BUILD @@ -0,0 +1,65 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +cc_library( + name = "embedding_variable_test_lib", + hdrs = [ + "embedding_variable_test.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/lib:tensor_bundle", + ], +) + +cc_test( + name = "embedding_variable_ops_test", + srcs = [ + "embedding_variable_ops_test.cc", + "embedding_variable_test.h", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":embedding_variable_test_lib", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) + +cc_test( + name = "embedding_variable_performance_test", + srcs = [ + "embedding_variable_performance_test.cc", + "embedding_variable_test.h", + ], + deps = [ + 
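        # embedding_variable_test_lib bundles embedding_variable_test.h with
+        # the kv_variable_util and tensor_bundle deps the fixtures rely on. +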
":embedding_variable_test_lib", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) + +cc_test( + name = "embedding_variable_memory_test", + srcs = [ + "embedding_variable_memory_test.cc", + "embedding_variable_test.h", + ], + deps = [ + ":embedding_variable_test_lib", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc new file mode 100644 index 00000000..bc095509 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc @@ -0,0 +1,80 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "embedding_variable_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace embedding { +float PerfMemory(Tensor& default_value, const std::vector& id_list, + int value_size, int64 default_value_dim, int64 filter_freq = 0, + int64 steps_to_live = 0, int64 record_freq = false) { + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, + filter_freq, steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, record_freq); + void* value_ptr = nullptr; + bool is_filter = false; + double start_mem, end_mem; + start_mem = getResident() * getpagesize(); + for (int i = 0; i < id_list.size(); i++) { + ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); + if (is_filter) ev->flat(value_ptr); + } + end_mem = getResident() * getpagesize(); + double used_mb = (end_mem - start_mem) / 1000000; + LOG(INFO) << "[TestMemory]Use Memory: " << used_mb; + return used_mb; +} + +TEST(EmbeddingVariabelMemoryTest, TestMemory) { + int value_size = 32; + int64 default_value_dim = 4096; + int filter_freq = 2; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = i; + } + float used_mb = + PerfMemory(default_value, id_list, value_size, default_value_dim); + float theoritical_mb = + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; + LOG(INFO) << "[TestMemory]Theoritical Memory: " << theoritical_mb; + EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && + (used_mb < 
theoritical_mb * 1.07)); + + for (int i = 0; i < num_of_ids / 2; i++) { + id_list.emplace_back(i); + } + used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, + filter_freq); + theoritical_mb = 50 + num_of_ids * + (8 + value_size * sizeof(float) / 2 + + 4 /*memory for ids_list*/) / + 1000000; + LOG(INFO) << "[TestMemory]Theoritical Memory: " << theoritical_mb; + EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && + (used_mb < theoritical_mb * 1.25)); +} +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc new file mode 100644 index 00000000..a29d3d16 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc @@ -0,0 +1,1324 @@ +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "embedding_variable_test.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +#include +#include + +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +namespace embedding { +namespace { +const int THREADNUM = 16; +const int64 max = 2147483647; + +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() + : size(0), resident(0), share(0), trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} + +string Prefix(const string& prefix) { + return strings::StrCat(testing::TmpDir(), "/", prefix); +} + +std::vector AllTensorKeys(BundleReader* reader) { + std::vector ret; + reader->Seek(kHeaderEntryKey); + reader->Next(); + for (; reader->Valid(); reader->Next()) { + // ret.push_back(reader->key().ToString()); + ret.push_back(std::string(reader->key())); + } + return ret; +} + +TEST(EmbeddingVariableTest, TestEmptyEV) { + int64 value_size = 8; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + { + auto variable = CreateEmbeddingVar(value_size, value, 1); + + LOG(INFO) << "size:" << variable->Size(); + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + 
TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ(AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", + "var/part_0-keys", "var/part_0-keys_filtered", + "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0, value_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } + } +} + +TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { + int64 value_size = 8; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); + + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + for (int64 i = 0; i < 5; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + vflat(i) = 5.0; + } + + LOG(INFO) << "size:" << variable->Size(); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ( + AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", "var/part_0-keys", + "var/part_0-keys_filtered", "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{5}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. 
+ Tensor val(dtype, TensorShape{5, value_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{5}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } +} + +TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); + + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + int64 ev_size = 10048576; + for (int64 i = 0; i < ev_size; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } + + LOG(INFO) << "size:" << variable->Size(); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ( + AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", "var/part_0-keys", + "var/part_0-keys_filtered", "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{ev_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{ev_size, value_size}); + LOG(INFO) << "read values:" << val.DebugString(); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. 
+ Tensor val(dtype, TensorShape{ev_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } +} + +void multi_insertion(EmbeddingVar* variable, int64 value_size) { + for (long j = 0; j < 5; j++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(j, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } +} + +TEST(EmbeddingVariableTest, TestMultiInsertion) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + std::vector insert_threads(THREADNUM); + for (size_t i = 0; i < THREADNUM; i++) { + insert_threads[i] = std::thread(multi_insertion, variable, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } + + ASSERT_EQ(variable->Size(), 5); +} + +void InsertAndLookup(EmbeddingVar* variable, int64* keys, + long ReadLoops, int value_size) { + for (long j = 0; j < ReadLoops; j++) { + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); + } +} + +void MultiBloomFilter(EmbeddingVar* var, int value_size, + int64 i) { + for (long j = 0; j < 1; j++) { + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i + 1, &val, &is_filter, false); + } +} + +TEST(EmbeddingVariableTest, TestBloomFilter) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + std::vector default_value = {0.0, 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01); + + // float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); + + std::vector keylist; + std::vector valuelist; + std::vector version_list; + std::vector freq_list; + + ASSERT_EQ(var->Size(), 1); +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt64) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT64); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + 
tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int64* counter = (int64*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt32) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT32); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int32* counter = (int32*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt16) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT16); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) 
{ + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + // int16* counter = (int16 *)var->GetBloomCounter(); + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int16* counter = (int16*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt8) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT8); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + int8* counter = (int8*)bloom_filter->GetBloomCounter(); + //(int64 *)var->GetBloomCounter(); + // int8* counter = (int8 *)var->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ((int)counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestInsertAndLookup) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + int64 InsertLoops = 1000; + bool* flag = (bool*)malloc(sizeof(bool) * max); + srand((unsigned)time(NULL)); + int64* keys = (int64*)malloc(sizeof(int64) * InsertLoops); + + for (long i = 0; i < max; i++) { + flag[i] = 0; + } + + int index = 0; + while (index < InsertLoops) { + long j = rand() % max; + if (flag[j] == 1) // the number is already set as a key + continue; + else { // the number is not selected as a key + keys[index] = j; + index++; + flag[j] = 1; + } + } + free(flag); + std::vector insert_threads(THREADNUM); + for (size_t i = 0; i < THREADNUM; i++) { + insert_threads[i] = std::thread(InsertAndLookup, 
variable, + &keys[i * InsertLoops / THREADNUM], + InsertLoops / THREADNUM, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } +} + +void MultiFilter(EmbeddingVar* variable, int value_size) { + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); +} + +TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + int thread_num = 5; + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num; i++) { + insert_threads[i] = std::thread(MultiFilter, var, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } + + void* value_ptr = nullptr; + var->LookupOrCreateKey(20, &value_ptr); + ASSERT_EQ(var->GetFreq(20), thread_num); +} + +EmbeddingVar* InitEV_Lockless(int64 value_size) { + Tensor value(DT_INT64, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + return variable; +} + +void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, + int thread_num, int i) { + for (int64 j = i * InsertLoop / thread_num; + j < (i + 1) * InsertLoop / thread_num; j++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(j, &value_ptr); + } +} + +void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { + // testing::StopTiming(); + // testing::UseRealTime(); + + int64 value_size = 128; + auto variable = InitEV_Lockless(value_size); + int64 InsertLoop = 1000000; + + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + for (int64 i = 0; i < InsertLoop; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } + + // testing::StartTiming(); + while (iters--) { + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num; i++) { + insert_threads[i] = + std::thread(MultiLookup, variable, InsertLoop, thread_num, i); + } + for (auto& t : insert_threads) { + t.join(); + } + } +} + +TEST(EmbeddingVariableTest, TestAllocate) { + int value_len = 8; + double t0 = getResident() * getpagesize() / 1024.0 / 1024.0; + double t1 = 0; + LOG(INFO) << "memory t0: " << t0; + for (int64 i = 0; i < 1000; ++i) { + float* tensor_val = TypedAllocator::Allocate( + ev_allocator(), value_len, AllocationAttributes()); + t1 = getResident() * getpagesize() / 1024.0 / 1024.0; + memset(tensor_val, 0, sizeof(float) * value_len); + } + double t2 = getResident() * getpagesize() / 1024.0 / 1024.0; + LOG(INFO) << "memory t1-t0: " << t1 - t0; + LOG(INFO) << "memory t2-t1: " << t2 - t1; + LOG(INFO) << "memory t2-t0: " << t2 - t0; +} + +TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + int64 ev_size = 100; + for (int64 i = 0; i < ev_size; i++) { + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); + } + + LOG(INFO) << "size:" << variable->Size(); +} + +void t1(KVInterface* hashmap) { + for (int i = 0; i < 100; ++i) { + 
hashmap->Insert(i, nullptr); + } +} + +TEST(EmbeddingVariableTest, TestRemoveLockless) { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, false, false, + {false, 0}); + KVInterface* hashmap = + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); + ASSERT_EQ(hashmap->Size(), 0); + LOG(INFO) << "hashmap size: " << hashmap->Size(); + auto t = std::thread(t1, hashmap); + t.join(); + LOG(INFO) << "hashmap size: " << hashmap->Size(); + ASSERT_EQ(hashmap->Size(), 100); + TF_CHECK_OK(hashmap->Remove(1)); + TF_CHECK_OK(hashmap->Remove(2)); + ASSERT_EQ(hashmap->Size(), 98); + LOG(INFO) << "2 size:" << hashmap->Size(); +} + +TEST(EmbeddingVariableTest, TestLRUCachePrefetch) { + BatchCache* cache = new LRUCache(); + int num_ids = 5; + std::vector prefetch_ids; + int index = 0; + int64 true_evict_size; + int64* evict_ids = new int64[num_ids]; + std::vector access_seq; + for (int i = 1; i <= num_ids; i++) { + for (int j = 0; j < i; j++) { + prefetch_ids.emplace_back(i); + } + } + cache->add_to_prefetch_list(prefetch_ids.data(), prefetch_ids.size()); + ASSERT_EQ(cache->size(), 0); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 0); + for (int i = 1; i <= 2; i++) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 2); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 2); + access_seq.clear(); + for (int i = 5; i >= 3; i--) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 3); + true_evict_size = cache->get_evic_ids(evict_ids, 2); + ASSERT_EQ(evict_ids[0], 5); + ASSERT_EQ(evict_ids[1], 4); + ASSERT_EQ(cache->size(), 1); + + delete cache; + delete[] evict_ids; +} + +TEST(EmbeddingVariableTest, TestLRUCache) { + BatchCache* cache = new LRUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 50; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + int64 size = cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(size, num_ids); + ASSERT_EQ(cache->size(), 0); + for (int i = 0; i < size; i++) { + ASSERT_EQ(evict_ids[i], (num_access % num_ids + i) % num_ids); + } +} + +TEST(EmbeddingVariableTest, TestLRUCacheGetCachedIds) { + BatchCache* cache = new LRUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 15; + int num_cache = 20; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + ASSERT_EQ(cache->size(), num_ids); + int64* cached_ids = new int64[num_cache]; + int64* cached_freqs = new int64[num_cache]; + int64 true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 20); + cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(cache->size(), 15); + true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 15); + delete cache; + delete[] cached_ids; + delete[] cached_freqs; +} + +TEST(EmbeddingVariableTest, TestLFUCacheGetCachedIds) { + BatchCache* cache = new LFUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 15; + int num_cache = 20; + int64 
ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + ASSERT_EQ(cache->size(), num_ids); + int64* cached_ids = new int64[num_cache]; + int64* cached_freqs = new int64[num_cache]; + int64 true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 20); + cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(cache->size(), 15); + true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 15); + delete cache; + delete[] cached_ids; + delete[] cached_freqs; +} + +TEST(EmbeddingVariableTest, TestLFUCachePrefetch) { + BatchCache* cache = new LFUCache(); + int num_ids = 5; + std::vector prefetch_ids; + int index = 0; + int64 true_evict_size; + int64* evict_ids = new int64[num_ids]; + std::vector access_seq; + for (int i = 1; i <= num_ids; i++) { + for (int j = 0; j < i; j++) { + prefetch_ids.emplace_back(i); + } + } + cache->add_to_prefetch_list(prefetch_ids.data(), prefetch_ids.size()); + ASSERT_EQ(cache->size(), 0); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 0); + for (int i = 1; i <= 2; i++) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 2); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 2); + access_seq.clear(); + for (int i = 5; i >= 3; i--) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 3); + true_evict_size = cache->get_evic_ids(evict_ids, 2); + ASSERT_EQ(evict_ids[0], 3); + ASSERT_EQ(evict_ids[1], 4); + ASSERT_EQ(cache->size(), 1); + + delete cache; + delete[] evict_ids; +} + +TEST(EmbeddingVariableTest, TestLFUCache) { + BatchCache* cache = new LFUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 50; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + int64 size = cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(size, num_ids); + ASSERT_EQ(cache->size(), 0); + for (int i = 0; i < size; i++) { + ASSERT_EQ(evict_ids[i], (num_access % num_ids + i) % num_ids); + } +} + +const int total_size = 1024 * 8; +const int th_num = 1; +const int malloc_size = total_size / th_num; + +void malloc_use_allocator(Allocator* allocator) { + timespec start; + timespec end; + float* first = (float*)allocator->AllocateRaw(0, sizeof(float)); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "cost time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; +} + +TEST(EmbeddingVariableTest, TestEVMalloc) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, ev_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +TEST(EmbeddingVariableTest, TestCPUMalloc) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, 
cpu_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +#if GOOGLE_CUDA +TEST(EmbeddingVariableTest, TestGPUMalloc) { + SessionOptions sops; + std::unique_ptr device = + DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); + Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( + GPUOptions(), TfDeviceId(0), 1 << 26, {} /* peer_gpu_ids */); + + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, gpu_allocator); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { + SessionOptions sops; + std::unique_ptr device = + DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); + + auto gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( + GPUOptions(), TfDeviceId(0), 1 << 26, {} /* peer_gpu_ids */); + auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); + float* ptr_1 = mem_pool->Allocate(); + float* ptr_2 = mem_pool->Allocate(); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); + mem_pool->Deallocate(value_ptrs); + value_ptrs.clear(); + value_ptrs.emplace_back(ptr_2); + mem_pool->Deallocate(value_ptrs); + float* ptr_3 = mem_pool->Allocate(); + ASSERT_EQ(ptr_1, ptr_3); + delete mem_pool; +} +#endif // GOOGLE_CUDA + +void malloc_free_use_allocator(Allocator* allocator) { + timespec start; + timespec end; + std::vector ptrs; + float* first = (float*)allocator->AllocateRaw(0, sizeof(float)); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + ptrs.push_back(ptr_); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "first time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (auto iter = ptrs.begin(); iter != ptrs.end(); iter++) { + allocator->DeallocateRaw(*iter); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "free time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "second time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; +} + +TEST(EmbeddingVariableTest, TestEVMallocFree) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_free_use_allocator, ev_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +void SingleCommit(KVInterface* hashmap, std::vector keys, + int bias) { + std::vector value_ptrs; + for (int64 i = 0; i < keys.size(); ++i) { + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } + value_ptrs.push_back(tmp); + } + ASSERT_EQ(keys.size(), value_ptrs.size()); + uint64 start = Env::Default()->NowNanos(); + for (int64 i = 0; i < keys.size(); i++) { + hashmap->Commit(keys[i], value_ptrs[i]); + } + uint64 end = Env::Default()->NowNanos(); + uint64 result_cost = end - start; +} + 
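+// TestCompaction (below) exercises the SSDHashKV compaction path: keys are
+// committed several times with different bias values via SingleCommit, and
+// later lookups are expected to return the value from the most recent commit
+// for each key. The sleep(1) calls give the (possibly asynchronous)
+// compaction a chance to finish before the values are verified.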
+void TestCompaction() { + std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, true, true, + {false, 0}); + auto hashmap = new SSDHashKV(temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); + ASSERT_EQ(hashmap->Size(), 0); + std::vector ids; + for (int i = 0; i < 262144; i++) { + ids.emplace_back(i); + } + auto t1 = std::thread(SingleCommit, hashmap, ids, 3); + t1.join(); + ids.clear(); + for (int i = 0; i < 131073; i++) { + ids.emplace_back(i); + } + t1 = std::thread(SingleCommit, hashmap, ids, 1); + t1.join(); + ids.clear(); + sleep(1); + void* val = nullptr; + for (int i = 131073; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 3); + } + } + for (int i = 131073; i < 262144; i++) { + ids.emplace_back(i); + } + t1 = std::thread(SingleCommit, hashmap, ids, 2); + t1.join(); + ids.clear(); + ids.emplace_back(262155); + t1 = std::thread(SingleCommit, hashmap, ids, 0); + t1.join(); + sleep(1); + for (int i = 0; i < 131073; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 1); + } + } + for (int i = 131073; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 2); + } + } + delete hashmap; +} + +TEST(KVInterfaceTest, TestSSDKVAsyncCompaction) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "true", 1); + TestCompaction(); +} + +TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); + TestCompaction(); +} + +void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, true, true, + {false, 0}); + std::string temp_dir = testing::TmpDir(); + auto hashmap = new SSDHashKV(temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); + ASSERT_EQ(hashmap->Size(), 0); + std::vector ids; + for (int i = 0; i < 262145; i++) { + ids.emplace_back(i); + } + SingleCommit(hashmap, ids, 3); + sleep(1); + ids.clear(); + void* val = nullptr; + for (int i = 0; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 3); + } + } + delete hashmap; +} + +TEST(KVInterfaceTest, TestMmapMadviseFile) { + setenv("TF_SSDHASH_IO_SCHEME", "mmap_and_madvise", 1); + TestReadEmbFile(); +} + +TEST(KVInterfaceTest, TestMmapFile) { + std::string temp_dir = testing::TmpDir(); + setenv("TF_SSDHASH_IO_SCHEME", "mmap", 1); + TestReadEmbFile(); +} + +TEST(KVInterfaceTest, TestDirectIoFile) { + std::string temp_dir = testing::TmpDir(); + setenv("TF_SSDHASH_IO_SCHEME", "directio", 1); + TestReadEmbFile(); +} + +void InsertKey(EmbeddingVar* variable, int value_size) { + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + for (int64 i = 0; i < 100000000; i++) { + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); + } +} + +void RemoveKey(EmbeddingVar* variable) { + for (int64 i = 0; i < 10; i++) { + sleep(1); + variable->storage()->Remove(20); + } +} + +TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto var = CreateEmbeddingVar(value_size, value, 1); + 
int thread_num = 5; + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num - 1; i++) { + insert_threads[i] = std::thread(InsertKey, var, value_size); + } + insert_threads[thread_num - 1] = std::thread(RemoveKey, var); + for (auto& t : insert_threads) { + t.join(); + } +} + +TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto var = CreateEmbeddingVar(value_size, value, 1); + float* set_value = (float*)malloc(value_size * sizeof(float)); + // Insertion + for (int i = 0; i < 100; i++) { + for (int j = 0; j < value_size; j++) { + set_value[j] = i + j; + } + var->Insert(i, set_value); + } + free(set_value); + // GetSnapshot + std::vector key_list; + std::vector value_ptr_list; + std::vector version_list; + std::vector freq_list; + var->GetSnapshot(&key_list, &value_ptr_list, &version_list, &freq_list); + for (int i = 0; i < key_list.size(); i++) { + ASSERT_EQ(key_list[i], i); + for (int j = 0; j < value_size; j++) { + ASSERT_EQ(value_ptr_list[i][j], i + j); + } + } +} + +} // namespace +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc new file mode 100644 index 00000000..a7de65cc --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc @@ -0,0 +1,455 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "embedding_variable_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace embedding { +void GenerateSkewIds(int num_of_ids, float skew_factor, + std::vector& hot_ids_list, + std::vector& cold_ids_list) { + int num_of_hot_ids = num_of_ids * (1 - skew_factor); + int num_of_cold_ids = num_of_ids - num_of_hot_ids; + std::set hot_ids_set; + std::set cold_ids_set; + hot_ids_list.resize(num_of_hot_ids); + cold_ids_list.resize(num_of_cold_ids); + srand((unsigned)time(NULL)); + // Generate hot ids + for (int i = 0; i < num_of_hot_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + flag = hot_ids_set.insert(key).second; + hot_ids_list[i] = key; + } while (!flag); + } + // Generate cold ids + for (int i = 0; i < num_of_cold_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + if (hot_ids_set.find(key) != hot_ids_set.end()) { + flag = false; + } else { + flag = cold_ids_set.insert(key).second; + cold_ids_list[i] = key; + } + } while (!flag); + } +} + +void InitSkewInputBatch(std::vector>& input_batches, + float skew_factor, + const std::vector& hot_ids_list, + const std::vector& cold_ids_list) { + srand((unsigned)time(NULL)); + int num_of_hot_ids = hot_ids_list.size(); + int num_of_cold_ids = cold_ids_list.size(); + int num_of_batch = input_batches.size(); + for (int i = 0; i < input_batches.size(); i++) { + for (int j = 0; j < input_batches[i].size(); j++) { + int tmp = rand() % 10; + if ((float)tmp * 0.1 < skew_factor) { + int pos = rand() % num_of_hot_ids; + input_batches[i][j] = hot_ids_list[pos]; + } else { + int pos = rand() % num_of_cold_ids; + input_batches[i][j] = cold_ids_list[pos]; + } + } + } +} + +void GenerateSkewInput(int num_of_ids, float skew_factor, + std::vector>& input_batches) { + std::vector hot_ids_list; + std::vector cold_ids_list; + // Generate hot ids + GenerateSkewIds(num_of_ids, skew_factor, hot_ids_list, cold_ids_list); + // Select id for each batch + InitSkewInputBatch(input_batches, skew_factor, hot_ids_list, cold_ids_list); +} + +void thread_lookup_or_create(EmbeddingVar* ev, + const int64* input_batch, float* default_value, + int default_value_dim, float** outputs, + int value_size, int start, int end) { + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, + sizeof(float) * value_size); + } + } +} + +double PerfLookupOrCreate(const std::vector>& input_batches, + int num_thread, int filter_freq = 0) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, + filter_freq); + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + // Allocate Outputs for each batch + std::vector 
outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + // Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i < num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread( + thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), default_value_dim, outputs.data(), + value_size, thread_task_range[i].first, thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * 1000000000 + + end.tv_nsec - start.tv_nsec); + // Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value_matrix(key % default_value_dim, j); + if (output[j] != val) { + LOG(INFO) << "Value Error: outputs[" << key << "][" << j << "] is " + << output[j] << ", while the answer is " << val; + return -1.0; + } + } + } + // Deallocate Output + for (auto ptr : outputs) { + cpu_allocator()->DeallocateRaw(ptr); + } + } + ev->Unref(); + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookupOrCreate) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + LOG(INFO) << "[TestLookupOrCreate] Start generating skew input"; + GenerateSkewInput(num_of_ids, 0.8, input_batches); + LOG(INFO) << "[TestLookupOrCreate] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestLookupOrCreate] Test LookupOrCreate With " << num_thread + << " threads."; + double exec_time = PerfLookupOrCreate(input_batches, num_thread); + if (exec_time == -1.0) { + LOG(INFO) << "[TestLookupOrCreate] Test Failed"; + } else { + LOG(INFO) << "[TestLookupOrCreate] Performance of LookupOrCreate With " + << num_thread << " threads: " << exec_time / 1000000 << " ms"; + } + } +} + +void thread_lookup(EmbeddingVar* ev, const int64* input_batch, + float** outputs, int value_size, int start, int end) { + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupKey(input_batch[i], &value_ptr); + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } +} + +double PerfLookup(EmbeddingVar* ev, + const std::vector>& input_batches, + int num_thread, int value_size, float* default_value, + int64 default_value_dim) { + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + // Allocate Outputs for each batch + std::vector outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + // Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i <
num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread( + thread_lookup, ev, input_batches[k].data(), outputs.data(), + value_size, thread_task_range[i].first, thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * 1000000000 + + end.tv_nsec - start.tv_nsec); + // Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value[(key % default_value_dim) * value_size + j]; + if (output[j] != val) { + LOG(INFO) << "Value Error: outputs[" << key << "][" << j << "] is " + << output[j] << ", while the answer is " << val; + return -1.0; + } + } + } + // Deallocate Output + for (auto ptr : outputs) { + cpu_allocator()->DeallocateRaw(ptr); + } + } + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookup) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + int value_size = 32; + int64 default_value_dim = 4096; + float skew_factor = 0.8; + + LOG(INFO) << "[TestLookup] Start initializing EV storage."; + std::vector hot_ids_list; + std::vector cold_ids_list; + GenerateSkewIds(num_of_ids, skew_factor, hot_ids_list, cold_ids_list); + + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = 0; i < hot_ids_list.size(); i++) { + ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); + } + for (int i = 0; i < cold_ids_list.size(); i++) { + ev->LookupOrCreateKey(cold_ids_list[i], &value_ptr, &is_filter, false); + } + LOG(INFO) << "[TestLookup] End initializing EV storage."; + + LOG(INFO) << "[TestLookup] Start generating skew input"; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + InitSkewInputBatch(input_batches, skew_factor, hot_ids_list, cold_ids_list); + LOG(INFO) << "[TestLookup] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestLookup] Test Lookup With " << num_thread << " threads."; + double exec_time = + PerfLookup(ev, input_batches, num_thread, value_size, + (float*)default_value.data(), default_value_dim); + if (exec_time == -1.0) { + LOG(INFO) << "[TestLookup] Test Failed"; + } else { + LOG(INFO) << "[TestLookup] Performance of Lookup With " << num_thread + << " threads: " << exec_time / 1000000 << " ms"; + } + } + ev->Unref(); +} + +string Prefix(const string& prefix) { + return strings::StrCat(testing::TmpDir(), "/", prefix); +} + +void PerfSave(Tensor& default_value, const std::vector& id_list, + int value_size, int64 default_value_dim, int64 steps_to_live = 0, +
float l2_weight_threshold = -1.0) { + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, 0, + steps_to_live, l2_weight_threshold); + void* value_ptr = nullptr; + bool is_filter = false; + srand((unsigned)time(NULL)); + + for (int i = 0; i < id_list.size(); i++) { + ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); + ev->flat(value_ptr); + int64 global_step = rand() % 100; + ev->UpdateVersion(value_ptr, global_step); + } + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + BundleWriter writer(Env::Default(), Prefix("foo")); + timespec start, end; + double total_time = 0.0; + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 100; + clock_gettime(CLOCK_MONOTONIC, &start); + ev->Save("var", Prefix("foo"), &writer, shrink_args); + clock_gettime(CLOCK_MONOTONIC, &end); + total_time += (double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec; + TF_ASSERT_OK(writer.Finish()); + LOG(INFO) << "[TestSave]execution time: " << total_time / 1000000 << "ms"; + ev->Unref(); +} + +TEST(EmbeddingVariablePerformanceTest, TestSave) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + srand((unsigned)time(NULL)); + std::vector id_list(num_of_ids); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim); +} + +TEST(EmbeddingVariablePerformanceTest, TestGlobalStepEviction) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + srand((unsigned)time(NULL)); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim, 80); +} + +TEST(EmbeddingVariablePerformanceTest, TestL2WeightEviction) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int l2_weight_threshold_index = default_value_dim * 0.2; + float l2_weight_threshold = 0.0; + for (int64 j = 0; j < value_size; j++) { + l2_weight_threshold += + pow(default_value_matrix(l2_weight_threshold_index, j), 2); + } + l2_weight_threshold *= 0.5; + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + srand((unsigned)time(NULL)); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim, 0, + l2_weight_threshold); +} + +TEST(EmbeddingVariablePerformanceTest, TestCounterFilterLookupOrCreate) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + int64 filter_freq = 5; + std::vector> input_batches(num_of_batch); + for (int i 
= 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Start generating skew input"; + GenerateSkewInput(num_of_ids, 0.8, input_batches); + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Test LookupOrCreate With " + << num_thread << " threads."; + double exec_time = + PerfLookupOrCreate(input_batches, num_thread, filter_freq); + if (exec_time == -1.0) { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Test Failed"; + } else { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Performance of " + "LookupOrCreate With " + << num_thread << " threads: " << exec_time / 1000000 << " ms"; + } + } +} +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h new file mode 100644 index 00000000..76b566f4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h @@ -0,0 +1,109 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_EMBEDING_VARIABLE_TEST_H +#define TENSORFLOW_CORE_KERNELS_EMBEDING_VARIABLE_TEST_H +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +#include +#include + +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +namespace embedding { +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() + : size(0), resident(0), share(0), trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} + +EmbeddingVar* CreateEmbeddingVar( + int value_size, Tensor& default_value, int64 default_value_dim, + int64 filter_freq = 0, int64 steps_to_live = 0, + float l2_weight_threshold = -1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024 * 1024 * 1024, 1024 * 1024 * 1024, + 1024 * 1024 * 1024, 1024 * 1024 * 1024}, + bool record_freq = false, int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { + auto embedding_config = EmbeddingConfig( + 0, 0, 1, 0, "emb_var", steps_to_live, filter_freq, 999999, + l2_weight_threshold, max_element_size, false_positive_probability, + counter_type, default_value_dim, 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", storage_size, + embedding_config), + cpu_allocator(), feat_desc, "emb_var"); + auto ev = new EmbeddingVar("emb_var", storage, embedding_config, + cpu_allocator(), feat_desc); + ev->Init(default_value, default_value_dim); + return ev; +} +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/config.proto b/deepray/custom_ops/embedding_variable/config.proto new file mode 100644 index 00000000..424fc5e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/config.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package tensorflow.embedding; + +enum StorageType { + // none + DEFAULT = 0; + + // one level + DRAM = 1; + PMEM_MEMKIND = 2; + PMEM_LIBPMEM = 3; + 
SSDHASH = 4; + LEVELDB = 5; + HBM = 6; + + // two level + DRAM_PMEM = 11; + DRAM_SSDHASH = 12; + HBM_DRAM = 13; + DRAM_LEVELDB = 14; + + // three level + DRAM_PMEM_SSDHASH = 101; + HBM_DRAM_SSDHASH = 102; + +} + +enum CopyBackFlag { + NOT_COPYBACK = 0; + COPYBACK = 1; + COPYBACK_AND_DESTROY = 2; +} + +enum SlotType { + EMBEDDING_VARIABLE = 0; + VARIABLE = 1; +} + +enum CacheStrategy { + LRU = 0; + LFU = 1; +} + +enum EmbeddingVariableType { + IMMUTABLE = 0; + MUTABLE = 1; +} + +enum ValuePtrStatus { + OK = 0; + IS_DELETED = 1; + NOT_IN_DRAM = 2; +} + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py b/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py new file mode 100644 index 00000000..42ca0c6b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py @@ -0,0 +1,114 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for multiplex_2.""" + +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.multiplex_2 import multiplex_2_op +from tensorflow.python.framework import errors_impl +# This pylint disable is only needed for internal google users +from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import + + +@test_util.with_eager_op_as_function +class MultiplexOpRank1Test(tf.test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_int(self): + a = tf.constant([1, 2, 3, 4, 5], dtype=tf.int64) + b = tf.constant([10, 20, 30, 40, 50], dtype=tf.int64) + cond = tf.constant([True, False, True, False, True], dtype=bool) + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + # expected result is [1, 20, 3, 40, 5] + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_float(self): + a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0]) + b = tf.constant([10.0, 20.0, 30.0, 40.0, 50.0]) + cond = tf.constant([True, False, True, False, True], dtype=bool) + # expected result is [1.0, 20.0, 3.0, 40.0, 5.0] + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_types(self): + a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0]) # float + b = tf.constant([10, 20, 30, 40, 50], dtype=tf.int64) + cond = tf.constant([True, False, True, False, True], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, TypeError), + # Eager mode raises InvalidArgumentError with the following message + r'(cannot compute Examples>MultiplexDense as input #2\(zero-based\) ' + r'was expected to be a float tensor but is a int64 tensor ' + r'\[Op:Examples>MultiplexDense\]' + r')|(' + 
# Graph mode raises TypeError with the following message + r"Input 'b' of 'Examples>MultiplexDense' Op has type int64 that " + r"does not match type float32 of argument 'a'.)" + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_size(self): + a = tf.constant([1, 2, 3, 4, 5], dtype=tf.int64) # longer than b + b = tf.constant([10, 20], dtype=tf.int64) # shorter than a + cond = tf.constant([True, False, True, False, True], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, ValueError), + # Eager mode raises InvalidArgumentError with the following message + r'(?s)(a and b must have the same shape. ' + r'a shape: \[5\] b shape: \[2\].* ' + r'\[Op:Examples>MultiplexDense\]' + r')|(' + # Graph mode raises ValueError with the following message + r'Dimension 0 in both shapes must be equal, but are 5 and 2\. ' + r'Shapes are \[5\] and \[2\]\.)' + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_2d(self): + a = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int64) + b = tf.constant([[10, 20, 30], [40, 50, 60]], dtype=tf.int64) + cond = tf.constant([[True, False, True], [False, True, False]], dtype=bool) + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + # expected result is [[1, 20], [3, 40]] + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_shape(self): + a = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int64) # shape (2,3) + b = tf.constant([[10, 20], [30, 40], [50, 60]], dtype=tf.int64) # shape (3,2) + cond = tf.constant([[True, False, True], [False, True, False]], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, ValueError), + # Eager mode raises InvalidArgumentError with the following message + r'(a and b must have the same shape.' + r' a shape: \[2,3\] b shape: \[3,2\]' + r')|(' + # Graph mode raises ValueError with the following message + r'Dimension 0 in both shapes must be equal, ' + r'but are 2 and 3\. Shapes are \[2,3\] and \[3,2\])\.' + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/deepray/custom_ops/embedding_variable/multiplex_1_test.py b/deepray/custom_ops/embedding_variable/multiplex_1_test.py new file mode 100644 index 00000000..2f2045e6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/multiplex_1_test.py @@ -0,0 +1,50 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for multiplex_1.""" + +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +# This pylint disable is only needed for internal google users +from tensorflow.python.framework import errors_impl # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import + + +@test_util.with_eager_op_as_function +class MultiplexOpRank1Test(tf.test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_int(self): + print(gen_kv_variable_ops) + print(dir(gen_kv_variable_ops)) + + # @test_util.run_in_graph_and_eager_modes + # def test_multiplex_int(self): + # shape = [3] + # dtype = tf.float32 + # shared_name = "var_1_2" + # name = "var_1/" + # _invalid_key_type = tf.int64 + # container = "" + # gen_kv_variable_ops.kv_var_handle_op(shape=shape, dtype=dtype, + # shared_name=shared_name, + # name=name, + # Tkeys=_invalid_key_type, + # container=container) + + +if __name__ == '__main__': + tf.test.main() diff --git a/deepray/seq2seq/tests/__init__.py b/deepray/custom_ops/embedding_variable/python/__init__.py similarity index 100% rename from deepray/seq2seq/tests/__init__.py rename to deepray/custom_ops/embedding_variable/python/__init__.py diff --git a/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py b/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py new file mode 100644 index 00000000..35536d6b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py @@ -0,0 +1,543 @@ +import sys +from collections import defaultdict + +import tensorflow as tf +from tensorflow.python.framework import indexed_slices +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import resource_loader +from tensorflow.python.platform import tf_logging as logging + +import deepray as dp +from . 
import kv_variable_ops +from .group_embedding_types import (DistStrategy, get_group_lookup_strategy) + +gen_group_embedding_ops = tf.load_op_library(resource_loader.get_path_to_datafile("../_group_embedding_ops.so")) + +__all__ = ["group_embedding_lookup", "group_embedding_lookup_sparse"] + + +#for GPU EV group_lookup_dense +def group_embedding_var_lookup_dense(params, dense_values, dimensions, ev_init_value=None): + if ev_init_value is not None: + default_value = ev_init_value + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0) + is_use_default_value_tensor = False + return gen_group_embedding_ops.group_embedding_var_lookup_dense( + params, dense_values, default_value, dimensions, is_use_default_value_tensor + ) + + +#for GPU EV group_lookup +def group_embedding_var_lookup( + params, + sp_values, + sp_indices, + sp_weights, + combiners, + batch_size, + dimensions, + ignore_weights, + is_sequence=False, + ev_init_value=None +): + if ev_init_value is not None: + default_value = ev_init_value + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0) + is_use_default_value_tensor = False + if ignore_weights: + sp_weight = ops.convert_to_tensor(1.0) + sp_weights = [sp_weight for _ in range(len(sp_values))] + return gen_group_embedding_ops.group_embedding_var_lookup( + params, + sp_values, + sp_indices, + sp_weights, + batch_size, + default_value, + combiners, + dimensions, + ignore_weights=ignore_weights, + is_use_default_value_tensor=is_use_default_value_tensor, + is_sequence=is_sequence + ) + + +def group_embedding_lookup(params, ids, partition_strategy="mod", name=None): + """ + This interface is designed for fused multiple embedding lookup. + Args: + params: list, tuple + a list or tuple of trainable *Variable* or *EmbeddingVariable*. + ids: list, tuple + a list or tuple of tf.SparseTensor or tf.Tensor. + btw RaggedTensor is preferred. + name: The operations name + Returns + ------- + emb_vec: list + a list of tf.Tensor(the results of lookup). 
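+
+  Example (an illustrative sketch; the dense `tf.Variable` tables and id tensors
+  below are hypothetical placeholders, not fixtures from this repository):
+
+  ```python
+  import tensorflow as tf
+
+  table_a = tf.Variable(tf.random.normal([1000, 16]))  # vocab 1000, dim 16
+  table_b = tf.Variable(tf.random.normal([500, 16]))   # vocab 500, dim 16
+  ids_a = tf.constant([3, 7, 42], dtype=tf.int64)
+  ids_b = tf.constant([0, 5, 9], dtype=tf.int64)
+
+  # Under the default LOCALIZED strategy, lookups that share an embedding
+  # dimension are grouped into one fused gather; conceptually, vec_a and
+  # vec_b hold the gathered rows for ids_a and ids_b.
+  vec_a, vec_b = group_embedding_lookup([table_a, table_b], [ids_a, ids_b])
+  ```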
+ """ + + if params is None: + raise ValueError("params must be specified") + if not isinstance(params, list): + params = [params] + for index, param in enumerate(params): + if isinstance(param, dp.layers.embedding_variable.EmbeddingVariable): + params[index] = param.embedding_variable + + if len(params) != len(ids): + raise ValueError("len of params must be equal to len of ids") + + ## Currently not doing unique + strategy = get_group_lookup_strategy() + + if strategy == DistStrategy.LOCALIZED: + + emb_vec = [None for _ in range(len(params))] + + ev_group_id_map = {} + tf_group_id_map = {} + ev_group_id = 0 + tf_group_id = 0 + is_ev_list = [False for _ in range(len(params))] + params_idx_map = {} + + for index, param in enumerate(params): + params_idx_map[param.ref()] = index + + if isinstance(param, kv_variable_ops.EmbeddingVariable): + is_ev_list[index] = True + dim = param.shape[0] + if dim not in ev_group_id_map: + ev_group_id_map[dim] = ev_group_id + ev_group_id += 1 + else: # tensorflow variable + dim = param.shape[1] + if dim not in tf_group_id_map: + tf_group_id_map[dim] = tf_group_id + tf_group_id += 1 + + if ev_group_id > 0: + ev_ids = [[] for _ in range(ev_group_id)] + ev_handlers = [[] for _ in range(ev_group_id)] + ev_dimensions = [0 for _ in range(ev_group_id)] + output_index_list = [[] for _ in range(ev_group_id)] + + for index, ev_flag in enumerate(is_ev_list): + if not ev_flag: + continue + param = params[index] + dim = param.shape[0] + group_id = ev_group_id_map[dim] + ev_id = ids[index] + + ev_dimensions[group_id] = dim + resource_variable_ops.variable_accessed(param) + ev_handlers[group_id].append(param.handle) + ev_ids[group_id].append(array_ops.reshape(ev_id, [-1])) + output_index_list[group_id].append(params_idx_map[param.ref()]) + + for group_id in range(ev_group_id): + dim = ev_dimensions[group_id] + output_index = output_index_list[group_id] + with ops.name_scope(name, "localized_group_embedding_lookup_ev_dim{}".format(dim), params + ids) as name_scope: + outputs = group_embedding_var_lookup_dense(ev_handlers[group_id], ev_ids[group_id], dim)[0] + for idx, output in zip(output_index, outputs): + emb_vec[idx] = output + + if tf_group_id > 0: + tf_ids = [[] for _ in range(tf_group_id)] + tf_handlers = [[] for _ in range(tf_group_id)] + tf_dimensions = [0 for _ in range(tf_group_id)] + output_index_list = [[] for _ in range(tf_group_id)] + + for index, ev_flag in enumerate(is_ev_list): + if ev_flag: + continue + param = params[index] + dim = param.shape[1] + group_id = tf_group_id_map[dim] + tf_id = ids[index] + + tf_dimensions[group_id] = dim + tf_handlers[group_id].append(param) + tf_ids[group_id].append(array_ops.reshape(tf_id, [-1])) + output_index_list[group_id].append(params_idx_map[param.ref()]) + + for group_id in range(tf_group_id): + dim = tf_dimensions[group_id] + output_index = output_index_list[group_id] + with ops.name_scope( + name, "localized_group_embedding_lookup_variable_dim{}".format(dim), params + ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup_dense( + tf_handlers[group_id], tf_ids[group_id], dim + )[0] + for idx, output in zip(output_index, outputs): + emb_vec[idx] = output + + else: + raise ValueError("Unrecognized strategy, expected collective, given{}".format(strategy)) + + return emb_vec + + +def group_embedding_lookup_sparse( + params, + sp_ids, + combiners, + sp_weights=None, + partition_strategy='mod', + is_sequence=False, + params_num_per_group=sys.maxsize, + name=None, +): + """ + This interface is 
designed for fused multiple embedding lookup.
+  Args:
+    params: list, tuple
+      a list or tuple of trainable *Variable* or *EmbeddingVariable*.
+    sp_ids: list, tuple
+      a list or tuple of tf.SparseTensor or tf.RaggedTensor.
+      RaggedTensor is preferred.
+    combiners: list, tuple
+      a list or tuple of strings specifying the combiner of each embedding lookup;
+      supported values are *sum* and *mean*.
+    sp_weights: list, tuple
+      a list or tuple of tf.SparseTensor used for the embedding lookup.
+    is_sequence: bool
+      if False, each result is a `Tensor` of shape `[batch_size, D]`;
+      if True, each result is a `Tensor` of shape `[batch_size, T, D]`.
+    params_num_per_group: int
+      The number of params handled by each GroupEmbedding op. The function schedules
+      len(params) // params_num_per_group + 1 GroupEmbedding ops. The default launches a
+      single op containing all params, which suits GPU scenarios and maximizes GPU
+      utilization. Set it to 1 when the op is placed on the CPU to maximize inter-op
+      parallelism.
+    name: The operation's name.
+  Returns
+  -------
+  emb_vec: list
+    a list of tf.Tensor (the lookup results).
+  """
+
+  if combiners is None:
+    logging.warn('The default value of combiner will change from "mean" to "sqrtn" after 2016/11/01.')
+    combiners = ['mean'] * len(params)
+  if not isinstance(combiners, list):
+    combiners = [combiners]
+  for combiner in combiners:
+    if combiner not in ('mean', 'sum'):
+      raise ValueError("combiners must be one of 'mean', 'sum'")
+
+  if params is None:
+    raise ValueError('params must be specified')
+  if not isinstance(params, list):
+    params = [params]
+
+  # PartitionedVariable is currently not supported.
+  for index, param in enumerate(params):
+    if isinstance(param, variables.PartitionedVariable):
+      tmp_param = list(param)
+      if len(tmp_param) != 1:
+        raise TypeError("PartitionedVariable is not supported in 'group_embedding_lookup_sparse'. 
") + params[index] = tmp_param[0] + elif isinstance(param, dp.layers.embedding_variable.EmbeddingVariable): + params[index] = param.embedding_variable + + ignore_weights = sp_weights is None + + if len(combiners) != len(sp_ids): + raise ValueError('len of combiners must be equal to len of sp_ids') + if len(combiners) != len(params): + raise ValueError('len of combiners must be equal to len of params') + if not ignore_weights: + if len(combiners) != len(sp_weights): + raise ValueError('len of combiners must be equal to len of sp_weights') + + strategy = get_group_lookup_strategy() + if strategy == DistStrategy.SOK: + import horovod.tensorflow as hvd + should_shard = False + if len(params) > hvd.size(): + should_shard = True + global_size = hvd.size() + if should_shard: + for (index, param) in enumerate(params): + param.target_gpu = index % global_size + else: + for (index, param) in enumerate(params): + param.target_gpu = -1 + + try: + from sparse_operation_kit import experiment as sok + except: + raise ImportError('sparse_operation_kit is not found while group_embedding strategy is given `collective`') + with ops.name_scope(name, 'group_embedding_lookup', params + sp_ids) as name_scope: + emb_vec = sok.lookup_sparse(params, sp_ids, combiners=combiners) + elif strategy == DistStrategy.HB: + emb_vec = [] + with ops.name_scope(name, 'group_embedding_lookup', params + sp_ids) as name_scope: + for idx, embedding in enumerate(params): + if not ignore_weights: + sp_weight = sp_weights[idx] + else: + sp_weight = None + emb_vec.append(embedding_lookup_sparse(embedding, sp_ids[idx], sp_weight, combiner=combiners[idx])) + + elif strategy == DistStrategy.LOCALIZED: + + emb_vec = [None for _ in range(len(params))] + + ev_group_id_map = {} + tf_group_id_map = {} + ev_group_id = 0 + tf_group_id = 0 + is_ev_list = [False for _ in range(len(params))] + params_idx_map = defaultdict(list) # queue + + for (index, param) in enumerate(params): + params_idx_map[param.ref()].append(index) + sp_id = sp_ids[index] + if not isinstance(sp_id, sparse_tensor.SparseTensor): + try: # assume RaggedTensor + sp_id = sp_id.to_sparse() + sp_ids[index] = sp_id + except: + raise ValueError('sp_id is neither SparseTensor nor RaggedTensor!') + + if not ignore_weights: + sp_weight = sp_weights[index] + if sp_weight is not None: + if not isinstance(sp_weight, sparse_tensor.SparseTensor): + raise TypeError('sp_weights must be either None or SparseTensor') + sp_id.values.get_shape().assert_is_compatible_with(sp_weight.values.get_shape()) + sp_id.indices.get_shape().assert_is_compatible_with(sp_weight.indices.get_shape()) + sp_id.dense_shape.get_shape().assert_is_compatible_with(sp_weight.dense_shape.get_shape()) + + if isinstance(param, kv_variable_ops.EmbeddingVariable): + is_ev_list[index] = True + dim = param.shape[0] + if dim not in ev_group_id_map: + ev_group_id_map[dim] = ev_group_id + ev_group_id += 1 + else: + # tensorflow variable + dim = param.shape[1] + if dim not in tf_group_id_map: + tf_group_id_map[dim] = tf_group_id + tf_group_id += 1 + + if ev_group_id > 0: + ev_sp_values = [[] for _ in range(ev_group_id)] + ev_sp_indices = [[] for _ in range(ev_group_id)] + ev_sp_weights = [[] for _ in range(ev_group_id)] + ev_dense_shapes = [[] for _ in range(ev_group_id)] + ev_handlers = [[] for _ in range(ev_group_id)] + ev_dimensions = [0 for _ in range(ev_group_id)] + ev_combiners = ['mean' for _ in range(ev_group_id)] + output_index_list = [[] for _ in range(ev_group_id)] + + for (index, ev_flag) in enumerate(is_ev_list): + if 
not ev_flag: + continue + param = params[index] + dim = param.shape[0] + group_id = ev_group_id_map[dim] + sp_id = sp_ids[index] + combiner = combiners[index] + + ev_combiners[group_id] = combiner + ev_dimensions[group_id] = dim + resource_variable_ops.variable_accessed(param) + ev_handlers[group_id].append(param.handle) + ev_sp_values[group_id].append(sp_id.values) + ev_sp_indices[group_id].append(sp_id.indices) + ev_dense_shapes[group_id].append(sp_id.dense_shape) + output_index_list[group_id].append(params_idx_map[param.ref()].pop(0)) + + if not ignore_weights: + sp_weight = sp_weights[index] + ev_sp_weights[group_id].append(sp_weight.values) + + for group_id in range(ev_group_id): + dim = ev_dimensions[group_id] + output_index = output_index_list[group_id] + + (num_sub_group, num_remainder) = \ + divmod(len(ev_handlers[group_id]), + params_num_per_group) + for j in range(num_sub_group): + sub_ev_sp_weight = ( + [None for _ in range(params_num_per_group)] if ignore_weights else + (ev_sp_weights[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_ev_dim{}_{}'.format(dim, j), params + sp_ids + ) as name_scope: + outputs = group_embedding_var_lookup( + (ev_handlers[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (ev_sp_values[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (ev_sp_indices[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + sub_ev_sp_weight, + ev_combiners[group_id], + (ev_dense_shapes[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[j * params_num_per_group:(j + 1) * params_num_per_group], outputs): + emb_vec[idx] = output + + if num_remainder > 0: + sub_ev_sp_weight = ( + [None for _ in range(num_remainder)] if ignore_weights else (ev_sp_weights[group_id])[-num_remainder:] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_ev_dim{}'.format(dim), params + sp_ids + ) as name_scope: + outputs = group_embedding_var_lookup( + (ev_handlers[group_id])[-num_remainder:], + (ev_sp_values[group_id])[-num_remainder:], + (ev_sp_indices[group_id])[-num_remainder:], + sub_ev_sp_weight, + ev_combiners[group_id], + (ev_dense_shapes[group_id])[-num_remainder:], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[-num_remainder:], outputs): + emb_vec[idx] = output + + if tf_group_id > 0: + tf_sp_values = [[] for _ in range(tf_group_id)] + tf_sp_indices = [[] for _ in range(tf_group_id)] + tf_sp_weights = [[] for _ in range(tf_group_id)] + tf_dense_shape = [[] for _ in range(tf_group_id)] + tf_handlers = [[] for _ in range(tf_group_id)] + tf_dimensions = [0 for _ in range(tf_group_id)] + tf_combiners = ['mean' for _ in range(tf_group_id)] + output_index_list = [[] for _ in range(tf_group_id)] + + for (index, ev_flag) in enumerate(is_ev_list): + if ev_flag: + continue + param = params[index] + dim = param.shape[1] + group_id = tf_group_id_map[dim] + sp_id = sp_ids[index] + combiner = combiners[index] + + tf_combiners[group_id] = combiner + tf_dimensions[group_id] = dim + tf_handlers[group_id].append(param) + tf_sp_values[group_id].append(sp_id.values) + tf_sp_indices[group_id].append(sp_id.indices) + tf_dense_shape[group_id].append(sp_id.dense_shape) + output_index_list[group_id].append(params_idx_map[param].pop(0)) + + if not ignore_weights: + sp_weight = sp_weights[index] + 
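+          # Only the weight values are collected here; the matching indices and
+          # dense_shape for each weighted lookup are taken from sp_id above.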
tf_sp_weights[group_id].append(sp_weight.values) + + for group_id in range(tf_group_id): + dim = tf_dimensions[group_id] + output_index = output_index_list[group_id] + + (num_sub_group, num_remainder) = divmod(len(tf_handlers[group_id]), params_num_per_group) + for j in range(num_sub_group): + sub_tf_sp_weight = ( + [None for _ in range(params_num_per_group)] if ignore_weights else + (tf_sp_weights[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_variable_dim{}_{}'.format(dim, j), params + sp_ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup( + (tf_handlers[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (tf_sp_values[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (tf_sp_indices[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + sub_tf_sp_weight, + tf_combiners[group_id], + (tf_dense_shape[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[j * params_num_per_group:(j + 1) * params_num_per_group], outputs): + emb_vec[idx] = output + + if num_remainder > 0: + sub_tf_sp_weight = ( + [None for _ in range(num_remainder)] if ignore_weights else (tf_sp_weights[group_id])[-num_remainder:] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_variable_dim{}'.format(dim), params + sp_ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup( + (tf_handlers[group_id])[-num_remainder:], + (tf_sp_values[group_id])[-num_remainder:], + (tf_sp_indices[group_id])[-num_remainder:], + sub_tf_sp_weight, + tf_combiners[group_id], + (tf_dense_shape[group_id])[-num_remainder:], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[-num_remainder:], outputs): + emb_vec[idx] = output + elif strategy == DistStrategy.UNKNOWN: + + raise ValueError('Unrecognized strategy, expected collective, given{}'.format(strategy)) + + return emb_vec + + +@ops.RegisterGradient("GroupEmbeddingVarLookupDense") +def _GroupGatherDenseGrad(op, *top_grads): + ev_num = op.get_attr("num_lookups") + grads = [] + for i in range(ev_num): + handle = op.inputs[i] + indice = op.inputs[ev_num + i] + params_shape = resource_variable_ops.variable_shape(handle) + grad = top_grads[i] + grads.append(indexed_slices.IndexedSlices(grad, indice, params_shape)) + return grads + [None for _ in range(ev_num + 1)] + + +@ops.RegisterGradient("GroupEmbeddingVarLookup") +def _GroupGatherGrad(op, *grads): + ev_num = op.get_attr("num_lookups") + combiner = op.get_attr("combiner") + dimension = op.get_attr("dimension") + return_grads = [] + params = op.inputs[:ev_num] + sp_indices = op.inputs[ev_num * 2:ev_num * 3] + unique_values = op.outputs[ev_num:2 * ev_num] + batch_nums = op.outputs[3 * ev_num:4 * ev_num] + with ops.colocate_with(params[0]): + nnz_grads = gen_group_embedding_ops.group_embedding_variable_lookup_grad( + grads[:ev_num], params, unique_values, sp_indices, batch_nums, dimension, combiner + ) + for i in range(ev_num): + handle = params[i] + params_shape = resource_variable_ops.variable_shape(handle) + indice = unique_values[i] + grad = nnz_grads[i] + return_grads.append(indexed_slices.IndexedSlices(grad, indice, params_shape)) + return return_grads + [None for _ in range(ev_num * 4 + 1)] diff --git a/deepray/custom_ops/embedding_variable/python/group_embedding_types.py 
b/deepray/custom_ops/embedding_variable/python/group_embedding_types.py new file mode 100644 index 00000000..4eb679c9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/group_embedding_types.py @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines functions common to group embedding lookup files.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from enum import Enum, unique + + +@unique +class DistStrategy(Enum): + SOK = "sok" + HB = "hb" + DISTRIBUTED = "ps" + LOCALIZED = "localized" + UNKNOWN = "unknown" + + +_group_lookup_strategy = DistStrategy.LOCALIZED + + +def set_group_lookup_strategy(strategy): + + def str_to_strategy(strategy): + if strategy == "sok": + return DistStrategy.SOK + elif strategy == "hb": + return DistStrategy.HB + elif strategy == "ps": + return DistStrategy.DISTRIBUTED + elif strategy == "localized": + return DistStrategy.LOCALIZED + + global _group_lookup_strategy + _group_lookup_strategy = str_to_strategy(strategy) + + +def get_group_lookup_strategy(): + global _group_lookup_strategy + return _group_lookup_strategy diff --git a/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py b/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py new file mode 100644 index 00000000..c9dab432 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py @@ -0,0 +1,1027 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Ops to use variables as resources.""" + +# pylint: disable=g-bad-name +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import os +import weakref + +import tensorflow as tf +from absl import flags +from tensorflow.core.framework import attr_value_pb2 +from tensorflow.python.eager import context +from tensorflow.python.eager import tape +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import indexed_slices +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_module +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import handle_data_util +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.ops.resource_variable_ops import get_eager_safe_handle_data, _combine_handle_data, _set_handle_shapes_and_types, ResourceVariable +from tensorflow.python.platform import resource_loader +from tensorflow.python.saved_model import registration +from tensorflow.python.trackable import base as trackable +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.util import compat + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.utils import logging_util + +gen_kv_variable_ops = tf.load_op_library(resource_loader.get_path_to_datafile("../_kv_variable_ops.so")) + +logger = logging_util.get_logger() + +__all__ = ["EmbeddingVariable"] + + +def _variable_handle_from_shape_and_dtype(shape, dtype, key_type, shared_name, name, graph_mode, initial_value=None): + """Create a variable handle, copying in handle data from `initial_value`.""" + container = ops.get_default_graph()._container # pylint: disable=protected-access + if container is None: + container = "" + shape = tensor_shape.as_shape(shape) + dtype = dtypes.as_dtype(dtype) + key_type = dtypes.as_dtype(key_type) + + handle = gen_kv_variable_ops.kv_var_handle_op( + shape=shape, + dtype=dtype, + Tkeys=key_type, + shared_name=shared_name, + # debug_name=name, + name=name, + container=container + ) + if initial_value is None: + initial_value = handle + if graph_mode: + full_handle_data = _combine_handle_data(handle, initial_value) + _set_handle_shapes_and_types(handle, full_handle_data, graph_mode) + return handle + else: + handle_data = handle_data_util.create_handle_data(shape, dtype) + if initial_value is not None and initial_value.dtype == dtypes.variant: + extra_handle_data = get_eager_safe_handle_data(initial_value) + if extra_handle_data is not None and extra_handle_data.is_set: + if (not handle_data.is_set or len(handle_data.shape_and_type) != 1): + raise RuntimeError("Expected VarHandleOp to return a length==1 shape_and_type, " + f"but saw: '{handle_data}'") + handle_data.shape_and_type.extend(extra_handle_data.shape_and_type) + + _set_handle_shapes_and_types(handle, handle_data, graph_mode) + return handle + + +def eager_safe_variable_handle(initial_value, shape, key_type, shared_name, name, graph_mode): + """Creates a variable handle with information to do shape inference. 
+ + The dtype is read from `initial_value` and stored in the returned + resource tensor's handle data. + + If `initial_value.dtype == tf.variant`, we additionally extract the handle + data (if any) from `initial_value` and append it to the `handle_data`. + In this case, the returned tensor's handle data is in the form + + ``` + is_set: true + shape_and_type { + shape { + // initial_value.shape + } + dtype: DT_VARIANT + } + shape_and_type { + // handle_data(initial_value).shape_and_type[0] + } + shape_and_type { + // handle_data(initial_value).shape_and_type[1] + } + ... + ``` + + Ops that read from this tensor, such as `ReadVariableOp` and + `AssignVariableOp`, know that `handle_data(handle).shape_and_type[1:]` + correspond to the handle data of the variant(s) stored in the Variable. + + Args: + initial_value: A `Tensor`. + shape: The shape of the handle data. Can be `TensorShape(None)` (i.e. + unknown shape). + shared_name: A string. + name: A string. + graph_mode: A python bool. + + Returns: + The handle, a `Tensor` of type `resource`. + """ + dtype = initial_value.dtype.base_dtype + return _variable_handle_from_shape_and_dtype(shape, dtype, key_type, shared_name, name, graph_mode, initial_value) + + +class EmbeddingVariable(ResourceVariable, saveable_object.SaveableObject): + """Variable based on resource handles. + + See the [Variables How To](https://tensorflow.org/guide/variables) + for a high level overview. + + A `ResourceVariable` allows you to maintain state across subsequent calls to + session.run. + + The `ResourceVariable` constructor requires an initial value for the variable, + which can be a `Tensor` of any type and shape. The initial value defines the + type and shape of the variable. After construction, the type and shape of + the variable are fixed. The value can be changed using one of the assign + methods. + + Just like any `Tensor`, variables created with + `tf.Variable(use_resource=True)` can be used as inputs for other Ops in the + graph. Additionally, all the operators overloaded for the `Tensor` class are + carried over to variables, so you can also add nodes to the graph by just + doing arithmetic on variables. + + Unlike ref-based variable, a ResourceVariable has well-defined semantics. Each + usage of a ResourceVariable in a TensorFlow graph adds a read_value operation + to the graph. The Tensors returned by a read_value operation are guaranteed to + see all modifications to the value of the variable which happen in any + operation on which the read_value depends on (either directly, indirectly, or + via a control dependency) and guaranteed to not see any modification to the + value of the variable from operations that depend on the read_value operation. + Updates from operations that have no dependency relationship to the read_value + operation might or might not be visible to read_value. + + For example, if there is more than one assignment to a ResourceVariable in + a single session.run call there is a well-defined value for each operation + which uses the variable's value if the assignments and the read are connected + by edges in the graph. 
Consider the following example, in which two writes + can cause tf.Variable and tf.ResourceVariable to behave differently: + + ```python + a = tf.Variable(1.0, use_resource=True) + a.initializer.run() + + assign = a.assign(2.0) + with tf.control_dependencies([assign]): + b = a.read_value() + with tf.control_dependencies([b]): + other_assign = a.assign(3.0) + with tf.control_dependencies([other_assign]): + # Will print 2.0 because the value was read before other_assign ran. If + # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed. + tf.compat.v1.Print(b, [b]).eval() + ``` + """ + + def __init__( + self, # pylint: disable=super-init-not-called + initial_value=None, + trainable=None, + collections=None, + validate_shape=True, # pylint: disable=unused-argument + caching_device=None, + name=None, + dtype=None, + variable_def=None, + import_scope=None, + constraint=None, + distribute_strategy=None, + synchronization=None, + aggregation=None, + shape=None, + handle=None, + experimental_enable_variable_lifting=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Creates a variable. + + Args: + initial_value: A `Tensor`, or Python object convertible to a `Tensor`, + which is the initial value for the Variable. Can also be a callable with + no argument that returns the initial value when called. (Note that + initializer functions from init_ops.py must first be bound to a shape + before being used here.) + trainable: If `True`, the default, also adds the variable to the graph + collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as + the default list of variables to use by the `Optimizer` classes. + Defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys. The new variable is added to + these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. + validate_shape: If `False`, allows the variable to be initialized with a + value of unknown shape. If `True`, the default, the shape of + `initial_value` must be known. + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the Variable reside, to + deduplicate copying through `Switch` and other conditional statements. + name: Optional name for the variable. Defaults to `'Variable'` and gets + uniquified automatically. + dtype: If set, initial_value will be converted to the given type. If None, + either the datatype will be kept (if initial_value is a Tensor) or + float32 will be used (if it is a Python object convertible to a Tensor). + variable_def: `VariableDef` protocol buffer. If not None, recreates the + `ResourceVariable` object with its contents. `variable_def` and other + arguments (except for import_scope) are mutually exclusive. + import_scope: Optional `string`. Name scope to add to the + ResourceVariable. Only used when `variable_def` is provided. + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). 
Constraints are not safe to use when doing asynchronous + distributed training. + distribute_strategy: The tf.distribute.Strategy this variable is being + created inside of. + synchronization: Indicates when a distributed a variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set to + `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. + handle: (optional) The handle of a `tf.Variable`. If provided, only + `trainable`, `shape`, `dtype`, and `handle` will be used to construct + this `tf.Variable`. + experimental_enable_variable_lifting: Whether to lift the variable out if + it's in a `tf.function`. Default is `True`. When this argument + is `True`, variable creation will follow the behavior and + restrictions described + [here](https://www.tensorflow.org/guide/function#creating_tfvariables). + If this argument is `False`, that description doesn't apply, + and you can freely create and use the variable in the + `tf.function`, as if it's a "mutable `tf.Tensor`". You can't + return the variable though. + + Raises: + ValueError: If the initial value is not specified, or does not have a + shape and `validate_shape` is `True`. + + @compatibility(eager) + When Eager Execution is enabled, the default for the `collections` argument + is `None`, which signifies that this `Variable` will not be added to any + collections. + @end_compatibility + """ + if variable_def: + if initial_value is not None: + raise ValueError( + f"The variable_def and initial_value args to " + f"`tf.Variable` are mutually exclusive, but got both: " + f"variable_def={variable_def},\n" + f"initial_value={initial_value}" + ) + if context.executing_eagerly(): + raise ValueError( + f"Creating a `tf.Variable` with a `variable_def` arg " + f"is not supported when eager execution is enabled. 
" + f"Got: variable_def={variable_def}" + ) + self._init_from_proto(variable_def, import_scope=import_scope, validate_shape=validate_shape) + elif handle is not None: + self._init_from_handle(trainable=trainable, shape=shape, dtype=dtype, handle=handle) + else: + evconfig.reveal() + self._init_from_args( + initial_value=initial_value, + trainable=trainable, + collections=collections, + caching_device=caching_device, + name=name, + dtype=dtype, + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation, + shape=shape, + distribute_strategy=distribute_strategy, + validate_shape=validate_shape, + experimental_enable_variable_lifting=experimental_enable_variable_lifting, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def __repr__(self): + return "" % (self.name, self.shape, self.dtype.name) + + def _init_from_args( + self, + initial_value=None, + trainable=None, + collections=None, + caching_device=None, + name=None, + dtype=None, + constraint=None, + synchronization=None, + aggregation=None, + distribute_strategy=None, + shape=None, + validate_shape=True, + experimental_enable_variable_lifting=None, + invalid_key=-1, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Creates a variable. + + Args: + initial_value: A `Tensor`, or Python object convertible to a `Tensor`, + which is the initial value for the Variable. The initial value must have + a shape specified unless `validate_shape` is set to False. Can also be a + callable with no argument that returns the initial value when called. + (Note that initializer functions from init_ops.py must first be bound to + a shape before being used here.) + trainable: If `True`, the default, also adds the variable to the graph + collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as + the default list of variables to use by the `Optimizer` classes. + Defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys. The new variable is added to + these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the Variable reside, to + deduplicate copying through `Switch` and other conditional statements. + name: Optional name for the variable. Defaults to `'Variable'` and gets + uniquified automatically. + dtype: If set, initial_value will be converted to the given type. If None, + either the datatype will be kept (if initial_value is a Tensor) or + float32 will be used (if it is a Python object convertible to a Tensor). + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. + synchronization: Indicates when a distributed a variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. 
By default the synchronization is set to + `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + distribute_strategy: DistributionStrategy under which this variable was + created. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. + validate_shape: If `False`, allows the variable to be initialized with a + value of unknown shape. If `True`, the default, the shape of + `initial_value` must be known. + experimental_enable_variable_lifting: Whether to lift the variable out if + it's in a `tf.function`. Default is `True`. When this argument + is `True`, variable creation will follow the behavior and + restrictions described + [here](https://www.tensorflow.org/guide/function#creating_tfvariables). + If this argument is `False`, that description doesn't apply, + and you can freely create and use the variable in the + `tf.function`, as if it's a "mutable `tf.Tensor`". You can't + return the variable though. + + Raises: + ValueError: If the initial value is not specified, or does not have a + shape and `validate_shape` is `True`. + + @compatibility(eager) + When Eager Execution is enabled, variables are never added to collections. + It is not implicitly added to the `GLOBAL_VARIABLES` or + `TRAINABLE_VARIABLES` collections, and the `collections` argument is + ignored. + @end_compatibility + """ + synchronization, aggregation, trainable = ( + variables.validate_synchronization_aggregation_trainable(synchronization, aggregation, trainable, name) + ) + if experimental_enable_variable_lifting is None: + experimental_enable_variable_lifting = True + if initial_value is None: + raise ValueError( + "The `initial_value` arg to `tf.Variable` must " + "be specified except when you are not providing a " + "`variable_def`. You provided neither." + ) + init_from_fn = callable(initial_value) + + if isinstance(initial_value, + tensor_module.Tensor) and hasattr(initial_value, "graph") and initial_value.graph.building_function: + raise ValueError( + f"Argument `initial_value` ({initial_value}) could not " + "be lifted out of a `tf.function`. " + f"(Tried to create variable with name='{name}'). " + "To avoid this error, when constructing `tf.Variable`s " + "inside of `tf.function` you can create the " + "`initial_value` tensor in a " + "`tf.init_scope` or pass a callable `initial_value` " + "(e.g., `tf.Variable(lambda : " + "tf.truncated_normal([10, 40]))`). " + "Please file a feature request if this " + "restriction inconveniences you." + ) + + if collections is None: + collections = [ops.GraphKeys.GLOBAL_VARIABLES] + if not isinstance(collections, (list, tuple, set)): + raise ValueError( + f"collections argument to Variable constructor must be a list, " + f"tuple, or set. Got {collections} of type {type(collections)}" + ) + if constraint is not None and not callable(constraint): + raise ValueError( + f"Argument `constraint` must be None or a callable. " + f"a callable. 
Got a {type(constraint)}: {constraint}" + ) + + if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: + collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] + + self._save_slice_info = None + self._in_graph_mode = not context.executing_eagerly() + self._steps_to_live = evconfig.steps_to_live + self._init_data_source = evconfig.init_data_source + self._emb_index = evconfig.emb_index + self._slot_index = evconfig.slot_index + self._block_num = evconfig.block_num + self._block_handle_name = None + self._primary = evconfig.primary + self._ht_type = evconfig.ht_type + self._ht_partition_num = ht_partition_num + self._is_sparse = False + self.importer = None + if evconfig.filter_strategy != None: + if isinstance(evconfig.filter_strategy, ev_variables.CounterFilter): + self._filter_freq = evconfig.filter_strategy.filter_freq + self._max_element_size = 0 + self._false_positive_probability = -1.0 + self._counter_type = dtypes.uint64 + elif isinstance(evconfig.filter_strategy, ev_variables.CBFFilter): + self._filter_freq = evconfig.filter_strategy.filter_freq + self._max_element_size = evconfig.filter_strategy.max_element_size + self._false_positive_probability = evconfig.filter_strategy.false_positive_probability + self._counter_type = evconfig.filter_strategy.counter_type + else: + self._filter_freq = 0 + self._max_element_size = 0 + self._false_positive_probability = -1.0 + self._counter_type = dtypes.uint64 + + self._record_freq = (os.environ.get("TF_RECORD_FREQ", "0") == "1") + self._record_version = (os.environ.get("TF_RECORD_VERSION", "0") == "1") + self._l2_weight_threshold = evconfig.l2_weight_threshold + self._storage_type = evconfig.storage_type + self._storage_path = evconfig.storage_path + self._storage_size = evconfig.storage_size + self._default_value_dim = evconfig.default_value_dim + self._default_value_no_permission = evconfig.default_value_no_permission + self._storage_cache_strategy = evconfig.storage_cache_strategy + self._layout = evconfig.layout + + if self._primary is None: + self._is_primary = True + else: + self._is_primary = False + + with ops.init_scope(): + self._in_graph_mode = not context.executing_eagerly() + if experimental_enable_variable_lifting: + maybe_init_scope = ops.init_scope + else: + maybe_init_scope = contextlib.nullcontext + with maybe_init_scope(): + with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value], skip_on_eager=False) as name: + self._invalid_key = invalid_key + self._invalid_key_type = ops.convert_to_tensor(invalid_key, name="invalid_key").dtype.base_dtype + handle_name = ops.name_from_scope_name(name) + shared_name = handle_name + if self._in_graph_mode: + unique_id = shared_name + else: + # When in eager mode, use a uid for the shared_name, to prevent + # accidental sharing. + unique_id = "%s_%d" % (handle_name, ops.uid()) + self._unique_id = unique_id + if handle_name is None: + self._handle_name = "Variable:0" + else: + self._handle_name = handle_name + ":0" + # Use attr_scope and device(None) to simulate the behavior of + # colocate_with when the variable we want to colocate with doesn't + # yet exist. 
+ device_context_manager = (ops.device if self._in_graph_mode else ops.NullContextmanager) + attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue(s=[compat.as_bytes("loc:@%s" % handle_name)]) + ) + with ops.get_default_graph()._attr_scope({"_class": attr}): + with ops.name_scope("Initializer"), device_context_manager(None): + if init_from_fn: + initial_value = initial_value() + if isinstance(initial_value, trackable.CheckpointInitialValue): + self._maybe_initialize_trackable() + self._update_uid = initial_value.checkpoint_position.restore_uid + initial_value = initial_value.wrapped_value + initial_value = ops.convert_to_tensor(initial_value, name="initial_value", dtype=dtype) + rank = initial_value.get_shape().rank - 1 + if shape is not None: + if not initial_value.shape.is_compatible_with(shape): + raise ValueError( + f"In this `tf.Variable` creation, the initial value's shape " + f"({initial_value.shape}) is not compatible with " + f"the explicitly supplied `shape` argument ({shape})." + ) + else: + shape = initial_value.get_shape()[rank:] + _device = "GPU" if self._storage_type in [ + config_pb2.StorageType.HBM, config_pb2.StorageType.HBM_DRAM, config_pb2.StorageType.HBM_DRAM_SSDHASH + ] else "CPU" + with ops.device(_device): + handle = eager_safe_variable_handle( + initial_value=initial_value, + shape=shape, + key_type=self._invalid_key_type, + shared_name=shared_name, + name=name, + graph_mode=self._in_graph_mode + ) + handle._parent_trackable = weakref.ref(self) + handle._name = handle_name + ":0" + handle._unique_id = unique_id + self._handle = handle + # pylint: disable=protected-access + if ( + self._in_graph_mode and initial_value is not None and + initial_value.op._get_control_flow_context() is not None + ): + raise ValueError( + f"The `initial_value` passed to `tf.Variable` {name} is from " + f"inside a control-flow construct, such as a loop or " + f"conditional. When creating a " + f"`tf.Variable` inside a loop or conditional, use a lambda as " + f"the `initial_value`. 
Got: initial_value=({initial_value})" + ) + # pylint: enable=protected-access + dtype = initial_value.dtype.base_dtype + self._counts_tensor = {} + self._is_multi_tier = self.is_multi_tier(self._storage_type) + if self._primary is None: + self._primary = self + + if self._is_primary: + self._slot_num = flags.FLAGS.ev_slot_num + else: + self._slot_num = evconfig.slot_num + + if self._in_graph_mode: + with ops.name_scope("IsInitialized"): + self._is_initialized_op = ( + gen_kv_variable_ops.kv_var_is_initialized_op(handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + ) + if initial_value is not None: + # pylint: disable=g-backslash-continuation + with ops.name_scope("Assign") as n, \ + ops.colocate_with(None, ignore_existing=True), \ + ops.device(handle.device): + with ops.control_dependencies(None if self._is_primary else [self._primary.initializer]): + self._init_op = gen_kv_variable_ops.initialize_kv_variable_v2_op( + handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + slot_num=self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE, + name=n + ) + set_attr_ops = [] + + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._init_op]): + set_cache_strategy_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, + cache_strategy=self._storage_cache_strategy, + Tkeys=self._invalid_key_type, + dtype=dtype + ) + set_attr_ops.append(set_cache_strategy_op) + with ops.control_dependencies(set_attr_ops + [self._init_op]): + self._initializer_op = control_flow_ops.no_op() + + self.create_init_op_for_restore(name, initial_value, invalid_key, rank) + else: + self._init_op = gen_kv_variable_ops.initialize_kv_variable_v2_op( + handle, + self._primary._handle, + initial_value, + ops.convert_to_tensor(invalid_key), + slot_num=self._slot_num, + shape=shape, + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + 
embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE + ) + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._init_op]): + set_cache_strategy_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, cache_strategy=self._storage_cache_strategy, Tkeys=self._invalid_key_type, dtype=dtype + ) + + if self._in_graph_mode: + # Eager variables are only added to collections if they are part of an + # eager variable store (otherwise in an interactive session they would + # hog memory and cause OOM). This is done in ops/variable_scope.py. + ops.add_to_collections(collections, self) + elif ops.GraphKeys.GLOBAL_STEP in collections: + ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self) + initial_value = initial_value if self._in_graph_mode else None + super(EmbeddingVariable, self).__init__( + trainable=trainable, + shape=shape, + dtype=dtype, + handle=handle, + synchronization=synchronization, + constraint=constraint, + aggregation=aggregation, + distribute_strategy=distribute_strategy, + name=name, + initial_value=initial_value, + caching_device=caching_device, + validate_shape=validate_shape, + ) + + def is_multi_tier(self, storage_type): + multi_level_list = [ + config_pb2.StorageType.LEVELDB, config_pb2.StorageType.SSDHASH, config_pb2.StorageType.DRAM_PMEM, + config_pb2.StorageType.DRAM_LEVELDB, config_pb2.StorageType.DRAM_SSDHASH, config_pb2.StorageType.HBM_DRAM, + config_pb2.StorageType.DRAM_PMEM_SSDHASH, config_pb2.StorageType.HBM_DRAM_SSDHASH + ] + return storage_type in multi_level_list + + def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): + with ops.control_dependencies(None if self._is_primary else [self._primary._init_op_for_restore]): + self._initializer_for_restore = gen_kv_variable_ops.initialize_kv_variable_v2_op( + self._handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + initial_num_buckets=config_pb2.IsSetInitialized.NOT_SET_INITAILIZED, + slot_num=self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE + ) + set_attr_ops = [] + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._initializer_for_restore]): + set_cache_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, cache_strategy=self._storage_cache_strategy, Tkeys=self._invalid_key_type, dtype=self._dtype + ) + set_attr_ops.append(set_cache_op) + with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): + self._init_op_for_restore = control_flow_ops.no_op() + # self.collect_restore_denpendencies() + + def sparse_read(self, indices, name=None, 
ev_init_value=None, counts=None): + """Reads the value of this variable sparsely, using `gather`.""" + with ops.name_scope("Gather" if name is None else name) as name: + if self._trainable: + tape.variable_accessed(self) + if ev_init_value is not None: + default_value = math_ops.cast(ev_init_value, self.dtype) + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0, dtype=self.dtype) + is_use_default_value_tensor = False + if counts is not None: + value = gen_kv_variable_ops.kv_resource_gather_v1( + self._handle, indices, default_value, counts, is_inference=True, name=name + ) + self._counts_tensor[indices] = counts + else: + value = gen_kv_variable_ops.kv_resource_gather( + self._handle, indices, default_value, is_use_default_value_tensor, is_inference=True, name=name + ) + return value + + @property + def initializer(self): + """The op responsible for initializing this variable.""" + return self._initializer_op + + @property + def initial_value(self): + """Returns the Tensor used as the initial value for the variable.""" + if context.executing_eagerly(): + raise RuntimeError("initial_value not supported in EAGER mode.") + return self._initial_value + + def is_initialized(self): + return gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + + def is_all_slot_initialized(self): + return gen_kv_variable_ops.kv_var_is_all_slot_initialized_op( + self._handle, Tkeys=self._invalid_key_type, dtype=self._dtype + ) + + @property + def block_num(self): + if self._block_num is None: + return 1 + else: + return self._block_num + + def need_counts(self): + return self._record_freq or (self._filter_freq > 0) or self._is_multi_tier + + @property + def storage_type(self): + return self._storage_type + + def lookup_resource(self): + return gen_kv_variable_ops.kv_resource_lookup_resource(self.handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + + # Unused + # def _gather_saveables_for_checkpoint(self): + # return {"foo": lambda name: EmbeddingVariableSaveable(self, name)} + + +def lookup_resource(var): + return gen_kv_variable_ops.kv_resource_lookup_resource(var.handle, Tkeys=var._invalid_key_type, dtype=var._dtype) + + +def variable_shape(handle, indices, grad): + handle_data = get_eager_safe_handle_data(handle) + if handle_data is None or not handle_data.is_set: + return gen_kv_variable_ops.kv_variable_shape(handle, Tkeys=indices.dtype, dtype=grad.dtype) + shape_proto = handle_data.shape_and_type[0].shape + if shape_proto.unknown_rank or any(x.size == -1 for x in shape_proto.dim): + return gen_kv_variable_ops.kv_variable_shape(handle, Tkeys=indices.dtype, dtype=grad.dtype) + return constant_op.constant([x.size for x in shape_proto.dim], dtype=dtypes.int32) + + +def get_tensor_slices(trackables): + tensor_names = [] + shapes_and_slices = [] + tensors = [] + restored_trackables = [] + ev_names = [] + ev_resources = [] + ev_key_types = [] + has_ev = False + for obj_prefix, obj in trackables.items(): + if isinstance(obj, EmbeddingVariable): + ev_names.append(obj.name) + ev_resources.append(obj.lookup_resource()) + ev_key_types.append(obj._invalid_key_type) + has_ev = True + + tensor_names.append(obj_prefix + "/value") + shapes_and_slices.append("") + tensors.append(constant_op.constant(2, dtype=obj.dtype)) + return tensor_names, shapes_and_slices, tensors, restored_trackables, ev_names, ev_resources, ev_key_types, has_ev + + +def save_fn(trackables, file_prefix): + """Save stack and part objects to a checkpoint 
shard.""" + tensor_names, shapes_and_slices, tensors, _, ev_names, ev_resources, ev_key_types, has_ev = get_tensor_slices( + trackables + ) + gen_kv_variable_ops.save_v3( + file_prefix, tensor_names, shapes_and_slices, ev_names, ev_resources, tensors, ev_key_types, has_ev + ) + return file_prefix + + +restore_queue = dict() + + +def restore_fn(trackables, merged_prefix): + for obj_prefix, obj in trackables.items(): + # Initialize queue entry if not exists + if obj._primary.name not in restore_queue: + restore_queue[obj._primary.name] = [] + restore_queue[obj._primary.name].append(obj) + if obj.is_all_slot_initialized(): + for ev in restore_queue[obj._primary.name]: + gen_kv_variable_ops.kv_resource_import_v3( + merged_prefix, + ev.handle, + ev.name, + ops.convert_to_tensor(ev._invalid_key), + shape=ev.shape, + partition_id=0, + partition_num=1, + dtype=ev.dtype + ) + + +registration.register_checkpoint_saver( + name="EmbeddingVariable", + predicate=lambda x: isinstance(x, (EmbeddingVariable)), + save_fn=save_fn, + restore_fn=restore_fn +) + + +@ops.RegisterGradient("KvResourceGather") +def _GatherGrad(op, grad): + """Gradient for gather op.""" + # Build appropriately shaped IndexedSlices + handle = op.inputs[0] + indices = op.inputs[1] + params_shape = variable_shape(handle, indices, grad) + size = array_ops.expand_dims(array_ops.size(indices), 0) + values_shape = array_ops.concat([size, params_shape[0:]], 0) + values = array_ops.reshape(grad, values_shape) + indices = array_ops.reshape(indices, size) + return [indexed_slices.IndexedSlices(values, indices, params_shape), None, None] + + +@ops.RegisterGradient("KvResourceGatherV1") +def _GatherV1Grad(op: ops.Operation, grad): + """Gradient for gather op.""" + # Build appropriately shaped IndexedSlices + handle = op.inputs[0] + indices = op.inputs[1] + params_shape = variable_shape(handle, indices, grad) + size = array_ops.expand_dims(array_ops.size(indices), 0) + values_shape = array_ops.concat([size, params_shape[0:]], 0) + values = array_ops.reshape(grad, values_shape) + indices = array_ops.reshape(indices, size) + return [indexed_slices.IndexedSlices(values, indices, params_shape), None, None] + + +ops.NotDifferentiable("KvVarIsInitializedOp") +ops.NotDifferentiable("KvVariableShape") + + +class EmbeddingVariableSaveable(saveable_object.SaveableObject): + """SaveableObject implementation that handles EmbeddingVariables.""" + + def __init__(self, var, name): + self.handle_op = var.handle + self.invalid_key = var.invalid_key + self.dtype = var._dtype + self.key_type = var._invalid_key_type + self.steps_to_live = var.steps_to_live + self.ht_type = var._ht_type + self.ht_partition_num = var._ht_partition_num + name = var._shared_name + self.var = var + is_partitioned_ev = not isinstance(self.var._save_slice_info, str) + self.partition_id = 0 + self.partition_num = 1 + if self.var._save_slice_info is not None: + self.partition_id = self.var._save_slice_info.var_offset[0] if is_partitioned_ev else 0 + self.partition_num = self.var._save_slice_info.full_shape[0] if is_partitioned_ev else 1 + + def _read_variable_closure(v): + + def f(): + with ops.device(v.device): + x = v.read_value() + return array_ops.identity(x) + + return f + + unused_tensor = var.handle + self.resource = lookup_resource(var) + + specs = [] + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-keys", dtype=self.key_type, device=var.device)) + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-values", dtype=dtypes.float32, device=var.device)) + 
specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-versions", dtype=dtypes.int64, device=var.device)) + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-freqs", dtype=dtypes.int64, device=var.device)) + + # pylint: disable=protected-access + super(EmbeddingVariableSaveable, self).__init__(var, specs, name) + self.is_sparse = var._is_sparse + + def restore(self, restored_tensors, unused_restored_shapes): + # pylint: disable=protected-access + with ops.device("/cpu:0"): + name_tensor = ops.convert_to_tensor(self.name) + with ops.colocate_with(self.handle_op): + handle_name = ops.name_from_scope_name(self.name) + is_partitioned_ev = not isinstance(self.var._save_slice_info, str) + if self.var._init_data_source is not None: + return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) + else: + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): + rank = self.op.initial_value.get_shape().rank - 1 + restore_op = gen_kv_variable_ops.kv_resource_import_v3( + restored_tensors[0], + self.handle_op, + name_tensor, + ops.convert_to_tensor(self.invalid_key), + shape=self.op.initial_value.get_shape()[rank:], + partition_id=self.partition_id, + partition_num=self.partition_num, + dtype=self.var._dtype + ) + return restore_op + + def incr_restore(self, restored_tensors, unused_restored_shapes): + # pylint: disable=protected-access + name_tensor = ops.convert_to_tensor(self.name) + with ops.colocate_with(self.handle_op): + handle_name = ops.name_from_scope_name(self.name) + return gen_kv_variable_ops.kv_resource_incr_import( + restored_tensors[0], + self.handle_op, + name_tensor, + ops.convert_to_tensor(self.invalid_key), + variables._try_guard_against_uninitialized_dependencies(self.name, self.op.initial_value), + partition_id=self.partition_id, + partition_num=self.partition_num + ) diff --git a/build_deps/toolchains/gpu/BUILD b/deepray/custom_ops/embedding_variable/python/tests/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/BUILD rename to deepray/custom_ops/embedding_variable/python/tests/__init__.py diff --git a/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py b/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py new file mode 100644 index 00000000..f1d1ee33 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py @@ -0,0 +1,116 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for EmbeddingBag layer.""" + +import pytest +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_bag import EmbeddingBag, _embedding_bag +from deepray.utils import test_utils + + +def manual_embedding_bag(indices, params, weights=None, combiner="mean"): + gathered = tf.gather(params, indices) + if weights is not None: + gathered *= tf.expand_dims(weights, -1) + if combiner == "sum": + return tf.reduce_sum(gathered, -2, keepdims=False) + else: + assert combiner == "mean" + assert weights is None + return tf.reduce_mean(gathered, -2, keepdims=False) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +def test_forward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + expected = manual_embedding_bag(indices, params, weights, combiner=combiner) + embedding_bag = EmbeddingBag(input_dim, 16, combiner=combiner, dtype=dtype) + embedding_bag.build(indices.shape) + embedding_bag.set_weights([params]) + indices = tf.convert_to_tensor(indices) + if weights is not None: + weights = tf.convert_to_tensor(weights) + output = embedding_bag( + indices, + weights, + ) + test_utils.assert_allclose_according_to_type(expected, output, half_rtol=1e-2, half_atol=1e-2) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +@pytest.mark.usefixtures("maybe_run_functions_eagerly") +def test_backward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + + indices = tf.convert_to_tensor(indices) + params = tf.convert_to_tensor(params) + if weights is not None: + weights = tf.convert_to_tensor(weights) + + embedding_bag_fn = tf.function(_embedding_bag) + + if combiner == "sum": + with tf.GradientTape(persistent=True) as tape: + tape.watch([params, weights]) + output = embedding_bag_fn(indices, params, weights, combiner="sum") + expected = manual_embedding_bag(indices, params, weights, combiner="sum") + + grads = tape.gradient(output, [params, weights]) + expected_grads = tape.gradient(expected, [params, weights]) + # Gather returns sparse IndexedSlices so we have to sum them together. 
+ test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) + test_utils.assert_allclose_according_to_type(expected_grads[1], grads[1], half_rtol=1e-2, half_atol=1e-2) + else: + with tf.GradientTape(persistent=True) as tape: + tape.watch(params) + output = embedding_bag_fn(indices, params, combiner=combiner) + expected = manual_embedding_bag(indices, params, combiner=combiner) + + grads = tape.gradient(output, [params]) + expected_grads = tape.gradient(expected, [params]) + # Gather returns sparse IndexedSlices so we have to sum them together. + test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) diff --git a/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py b/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py new file mode 100644 index 00000000..f41f9179 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py @@ -0,0 +1,254 @@ +"""Tests for tensorflow.ops.embedding_variable GPU version.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.platform import googletest +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.ops import embedding_ops + +from deepray.custom_ops.embedding_variable import kv_variable_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import array_ops + +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import config +from deepray.custom_ops.embedding_variable import config_pb2 +from tensorflow.python.training import training_util +from tensorflow.python.training import adagrad +from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_v2 +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable +from deepray.custom_ops.embedding_variable import variables as ev_variables + + +class GroupEmbeddingGPUTest(test_util.TensorFlowTestCase): + + @test_util.run_gpu_only + def testMultiKvResourceGather(self): + print("testMultiKvResourceGather") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + emb_var_0 = get_embedding_variable( + "emb_var_0", embedding_dim=8, 
initializer=init_ops.ones_initializer(dtypes.float32) + ) + + emb_var_1 = get_embedding_variable( + "emb_var_1", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(2)] + ev_weights = [emb_var_0, emb_var_1] + combiners = ["mean", "sum"] + + ev_result = runTestAdagrad(ev_weights, indices, combiners) + for i in range(4): + if i == 2: + for j in range(16): + self.assertEqual(ev_result[1].tolist()[i][j], 2) + else: + for j in range(16): + self.assertEqual(ev_result[1].tolist()[i][j], 1) + + for i in range(4): + for j in range(8): + self.assertEqual(ev_result[0].tolist()[i][j], 1) + + @test_util.run_gpu_only + def testMultiEmbeddingSparseLookUp(self): + print("testMultiEmbeddingSparseLookUp") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + + var_0 = variable_scope.get_variable( + "var_0", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 8) + ) + var_1 = variable_scope.get_variable( + "var_1", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(2)] + var_weights = [var_0, var_1] + combiners = ["mean", "sum"] + + var_result = runTestAdagrad(var_weights, indices, combiners) + for i in range(4): + if i == 2: + for j in range(16): + self.assertEqual(var_result[1].tolist()[i][j], 2) + else: + for j in range(16): + self.assertEqual(var_result[1].tolist()[i][j], 1) + + for i in range(4): + for j in range(8): + self.assertEqual(var_result[0].tolist()[i][j], 1) + + @test_util.run_gpu_only + def testMultiKvResourceGatherEqualMultiEmbeddingSparseLookUp(self): + print("testMultiKvResourceGather") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + 
sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + emb_var_1 = get_embedding_variable( + "emb_var_0", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + emb_var_2 = get_embedding_variable( + "emb_var_1", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + var_0 = variable_scope.get_variable( + "var_0", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + var_1 = variable_scope.get_variable( + "var_1", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(4)] + weights = [emb_var_1, emb_var_2, var_0, var_1] + combiners = ["mean", "sum", "mean", "sum"] + + ev_result = runTestAdagrad(weights, indices, combiners) + + for i in range(2): + for j in range(0, 4): + for k in range(0, 16): + self.assertNear(ev_result[i].tolist()[j][k], ev_result[2 + i].tolist()[j][k], 1e-05) + + @test_util.run_gpu_only + def testMultiKvResourceGatherForSparseColumnEmbeddingCol(self): + with feature_column_v2.group_embedding_column_scope(name="test"): + ad_columns = feature_column_v2.categorical_column_with_embedding( + key="ad_emb", + dtype=dtypes.int64, + ev_option=ev_variables.EmbeddingVariableOption( + storage_option=ev_variables.StorageOption(storage_type=config_pb2.StorageType.HBM) + ) + ) + ad_weights = feature_column_v2.embedding_column( + categorical_column=ad_columns, dimension=8, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + user_columns = feature_column_v2.categorical_column_with_embedding( + key="user_emb", + dtype=dtypes.int64, + ev_option=variables.EmbeddingVariableOption( + storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM) + ) + ) + user_weights = feature_column_v2.embedding_column( + categorical_column=user_columns, dimension=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + ids = {} + ids["ad_emb"] = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 3]], + values=math_ops.cast([1, 2, 3, 4, 5], dtypes.int64), + dense_shape=[5, 4] + ) + ids["user_emb"] = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 1], [2, 2], [2, 3], [4, 3]], + values=math_ops.cast([1, 2, 3, 4, 5], dtypes.int64), + dense_shape=[5, 4] + ) + + emb = feature_column.input_layer(features=ids, feature_columns=[ad_weights, user_weights]) + + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run(init) + print("init global done") + print(sess.run([emb, train_op, loss])) + print(sess.run([emb, train_op, loss])) + print(sess.run([emb, train_op, loss])) + + +if __name__ == "__main__": + googletest.main() 
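For reference, the sketch below is not part of the patch; it is a minimal, illustrative test of the `EmbeddingVariable` lookup path that the tests above exercise end to end: `get_embedding_variable` builds a hash-table backed variable and `sparse_read` gathers one embedding row per arbitrary int64 id. Import paths, argument names, and the `EV_INIT_*` collections are taken from the calls shown in this patch; whether the deepray package registers the required flags (e.g. `ev_slot_num`) and graph-collection keys on import is an assumption, so treat this as a sketch rather than a supported example.

```python
# Illustrative only -- NOT part of this patch. Minimal sketch of the
# EmbeddingVariable lookup path exercised by the tests above, assuming the
# deepray package registers its flags and EV_INIT_* collection keys on import.
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import googletest

from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable


class EmbeddingVariableSketchTest(test_util.TensorFlowTestCase):

  def testSparseRead(self):
    # Ids are not restricted to a fixed [0, vocab) range; an unseen id gets a
    # freshly initialized row from the configured initializer.
    ids = ops.convert_to_tensor([1, 7, 7, 42], dtype=dtypes.int64)
    var = get_embedding_variable(
        "sketch_emb", embedding_dim=8, initializer=init_ops.ones_initializer(dtypes.float32)
    )
    emb = var.sparse_read(ids)  # expected shape: [4, 8]
    loss = math_ops.reduce_sum(emb)
    with self.test_session() as sess:
      # EmbeddingVariables are initialized through their own collections,
      # exactly as GroupEmbeddingGPUTest does before running the graph.
      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS))
      print(sess.run([emb, loss]))


if __name__ == "__main__":
  googletest.main()
```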
diff --git a/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py b/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py new file mode 100644 index 00000000..8261049e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py @@ -0,0 +1,7 @@ +from pathlib import Path +import sys +import pytest + +if __name__ == "__main__": + dirname = Path(__file__).absolute().parent + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/embedding_variable/variable_scope.py b/deepray/custom_ops/embedding_variable/variable_scope.py new file mode 100644 index 00000000..a1530297 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/variable_scope.py @@ -0,0 +1,1277 @@ +import collections as collections_lib +import copy +import functools +import traceback + +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops.variable_scope import AUTO_REUSE +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import function_utils +from tensorflow.python.util import tf_inspect +from tensorflow.python.util.tf_export import tf_export + +from deepray.utils import logging_util +from . import variables as ev_variables +from .python import kv_variable_ops + +logger = logging_util.get_logger() + + +@tf_export(v1=["VariableScope"]) +class VariableScope(object): + """Variable scope object to carry defaults to provide to `get_variable`. + + Many of the arguments we need for `get_variable` in a variable store are most + easily handled with a context. This object is used for the defaults. + + Attributes: + name: name of the current scope, used as prefix in get_variable. + initializer: default initializer passed to get_variable. + regularizer: default regularizer passed to get_variable. + reuse: Boolean, None, or tf.compat.v1.AUTO_REUSE, setting the reuse in + get_variable. When eager execution is enabled this argument is always + forced to be False. + caching_device: string, callable, or None: the caching device passed to + get_variable. + partitioner: callable or `None`: the partitioner passed to `get_variable`. + custom_getter: default custom getter passed to get_variable. + name_scope: The name passed to `tf.name_scope`. + dtype: default type passed to get_variable (defaults to DT_FLOAT). + use_resource: if False, create a normal Variable; if True create an + experimental ResourceVariable with well-defined semantics. Defaults to + False (will later change to True). When eager execution is enabled this + argument is always forced to be True. + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. 
+ """ + + def __init__( + self, + reuse, + name="", + initializer=None, + regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + name_scope="", + dtype=dtypes.float32, + use_resource=None, + constraint=None + ): + """Creates a new VariableScope with the given properties.""" + self._name = name + self._initializer = initializer + self._regularizer = regularizer + self._reuse = reuse + self._caching_device = caching_device + self._partitioner = partitioner + self._custom_getter = custom_getter + self._name_scope = name_scope + self._dtype = dtype + self._use_resource = use_resource + self._constraint = constraint + if context.executing_eagerly(): + if self._caching_device is not None: + raise NotImplementedError("Caching devices is not yet supported " + "when eager execution is enabled.") + self._reuse = AUTO_REUSE + self._use_resource = True + + @property + def name(self): + return self._name + + @property + def original_name_scope(self): + return self._name_scope + + @property + def reuse(self): + return self._reuse + + @property + def initializer(self): + return self._initializer + + @property + def dtype(self): + return self._dtype + + @property + def use_resource(self): + return self._use_resource + + @property + def regularizer(self): + return self._regularizer + + @property + def caching_device(self): + return self._caching_device + + @property + def partitioner(self): + return self._partitioner + + @property + def custom_getter(self): + return self._custom_getter + + @property + def constraint(self): + return self._constraint + + def reuse_variables(self): + """Reuse variables in this scope.""" + self._reuse = True + + def set_initializer(self, initializer): + """Set initializer for this scope.""" + self._initializer = initializer + + def set_dtype(self, dtype): + """Set data type for this scope.""" + self._dtype = dtype + + def set_use_resource(self, use_resource): + """Sets whether to use ResourceVariables for this scope.""" + if context.executing_eagerly() and not use_resource: + raise ValueError("When eager execution is enabled, " + "use_resource cannot be set to false.") + self._use_resource = use_resource + + def set_regularizer(self, regularizer): + """Set regularizer for this scope.""" + self._regularizer = regularizer + + def set_caching_device(self, caching_device): + """Set caching_device for this scope.""" + if context.executing_eagerly(): + raise NotImplementedError("Caching devices are not yet supported " + "when eager execution is enabled.") + self._caching_device = caching_device + + def set_partitioner(self, partitioner): + """Set partitioner for this scope.""" + self._partitioner = partitioner + + def set_custom_getter(self, custom_getter): + """Set custom getter for this scope.""" + self._custom_getter = custom_getter + + def get_collection(self, name): + """Get this scope's variables.""" + scope = self._name + "/" if self._name else "" + return ops.get_collection(name, scope) + + def trainable_variables(self): + """Get this scope's trainable variables.""" + return self.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + + def global_variables(self): + """Get this scope's global variables.""" + return self.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + + def local_variables(self): + """Get this scope's local variables.""" + return self.get_collection(ops.GraphKeys.LOCAL_VARIABLES) + + def get_variable( + self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + 
collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if context.executing_eagerly(): + reuse = False + use_resource = True + else: + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." % (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + return var_store.get_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + ) + + def get_embedding_variable( + self, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if not context.executing_eagerly(): + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + else: + reuse = AUTO_REUSE + use_resource = True + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." 
% (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + if invalid_key is None: + invalid_key = -1 + return _VariableStore().get_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def get_dynamic_dimension_embedding_variable( + self, + var_store, + name, + shape=None, + embedding_block_num=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if not context.executing_eagerly(): + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + else: + reuse = AUTO_REUSE + use_resource = True + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." 
% (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + if invalid_key is None: + invalid_key = -1 + return var_store.get_variable( + full_name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def _get_partitioned_variable( + self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + constraint=None, + ): + """Gets an existing variable with this name or create a new one.""" + if initializer is None: + initializer = self._initializer + if regularizer is None: + regularizer = self._regularizer + if constraint is None: + constraint = self._constraint + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if dtype is None: + dtype = self._dtype + if use_resource is None: + use_resource = self._use_resource + + if self._custom_getter is not None: + raise ValueError( + "Private access to _get_partitioned_variable is not allowed when " + "a custom getter is set. Current custom getter: %s. " + "It is likely that you're using create_partitioned_variables. " + "If so, consider instead using get_variable with a non-empty " + "partitioner parameter instead." % self._custom_getter + ) + + if partitioner is None: + raise ValueError("No partitioner was specified") + + # This allows the variable scope name to be used as the variable name if + # this function is invoked with an empty name arg, for backward + # compatibility with create_partitioned_variables(). + full_name_list = [] + if self.name: + full_name_list.append(self.name) + if name: + full_name_list.append(name) + full_name = "/".join(full_name_list) + + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # pylint: disable=protected-access + return var_store._get_partitioned_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=self.reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + ) + # pylint: enable=protected-access + + +class _VariableStore(object): + """Variable store that carries a number of named Variables. + + New variable names and new variables can be created; all stored + variables are initialized with the initializer passed to __init__. + + Attributes: + vars: a dictionary with string names (same as passed in GetVar) as keys and + the corresponding TensorFlow Variables as values. + """ + + def __init__(self): + """Create a variable store.""" + self._vars = {} # A dictionary of the stored TensorFlow variables. + self._partitioned_vars = {} # A dict of the stored PartitionedVariables. 
+ self._store_eager_variables = False + + def get_variable( + self, + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with these parameters or create a new one. + + If a variable with the given name is already stored, we return the stored + variable. Otherwise, we create a new one. + + Set `reuse` to `True` when you only want to reuse existing Variables. + Set `reuse` to `False` when you only want to create new Variables. + Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you want + variables to be created if they don't exist or returned if they do. + + If initializer is `None` (the default), the default initializer passed in + the constructor is used. If that one is `None` too, we use a new + `glorot_uniform_initializer`. If initializer is a Tensor, we use + it as a value and derive the shape from the initializer. + + If a partitioner is provided, a `PartitionedVariable` is returned. + Accessing this object as a `Tensor` returns the shards concatenated along + the partition axis. + + Some useful partitioners are available. See, e.g., + `variable_axis_size_partitioner` and `min_max_variable_partitioner`. + + Args: + name: The name of the new or existing variable. + shape: Shape of the new or existing variable. + dtype: Type of the new or existing variable (defaults to `DT_FLOAT`). + initializer: Initializer for the variable. + regularizer: A (Tensor -> Tensor or None) function; the result of applying + it on a newly created variable will be added to the collection + GraphKeys.REGULARIZATION_LOSSES and can be used for regularization. + reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation of + variables. When eager execution is enabled this argument is always + forced to be False. + trainable: If `True` also add the variable to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable` + defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys to add the `Variable` to. + Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`). + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the `Variable` reside, to + deduplicate copying through `Switch` and other conditional statements. + partitioner: Optional callable that accepts a fully defined `TensorShape` + and dtype of the `Variable` to be created, and returns a list of + partitions for each axis (currently only one axis can be partitioned). + validate_shape: If False, allows the variable to be initialized with a + value of unknown shape. If True, the default, the shape of initial_value + must be known. + use_resource: If False, creates a regular Variable. If True, creates + instead an experimental ResourceVariable which has well-defined + semantics. Defaults to False (will later change to True). When eager + execution is enabled this argument is always forced to be true. 
+ custom_getter: Callable that takes as a first argument the true getter, + and allows overwriting the internal get_variable method. The signature + of `custom_getter` should match that of this method, + but the most future-proof version will allow for changes: `def + custom_getter(getter, *args, **kwargs)`. Direct access to + all `get_variable` parameters is also allowed: `def + custom_getter(getter, name, *args, **kwargs)`. A simple identity + custom getter that simply creates variables with modified names is: + ```python + def custom_getter(getter, name, *args, **kwargs): return getter(name + + '_suffix', *args, **kwargs) ``` + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. + + Returns: + The created or existing `Variable` (or `PartitionedVariable`, if a + partitioner was used). + + Raises: + ValueError: when creating a new variable and shape is not declared, + when reusing a variable and specifying a conflicting shape, + or when violating reuse during variable creation. + RuntimeError: when eager execution is enabled and not called from an + EagerVariableStore. + """ + if custom_getter is not None and not callable(custom_getter): + raise ValueError("Passed a custom_getter which is not callable: %s" % custom_getter) + + # If a *_ref type is passed in an error would be triggered further down the + # stack. We prevent this using base_dtype to get a non-ref version of the + # type, before doing anything else. When _ref types are removed in favor of + # resources, this line can be removed. + try: + dtype = dtype.base_dtype + except AttributeError: + # .base_dtype not existing means that we will try and use the raw dtype + # which was passed in - this might be a NumPy type which is valid. + pass + + # This is the main logic of get_variable. However, custom_getter + # may override this logic. So we save it as a callable and pass + # it to custom_getter. + # Note: the parameters of _true_getter, and their documentation, match + # *exactly* item-for-item with the docstring of this method. 
+ def _true_getter( # pylint: disable=missing-docstring + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000): + is_scalar = (shape is not None and isinstance(shape, collections_lib.abc.Sequence) and not shape) + # Partitioned variable case + if partitioner is not None and not is_scalar: + if not callable(partitioner): + raise ValueError("Partitioner must be callable, but received: %s" % partitioner) + with ops.name_scope(None): + return self._get_partitioned_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + # Special case for partitioned variable to allow reuse without having to + # specify partitioner. + if reuse is True and partitioner is None and name in self._partitioned_vars: + return self._get_partitioned_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=None, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + # Single variable case + if "%s/part_0" % name in self._vars: + raise ValueError( + "No partitioner was provided, but a partitioned version of the " + "variable was found: %s/part_0. Perhaps a variable of the same " + "name was already created with partitioning?" % name + ) + + return self._get_single_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + if custom_getter is not None: + # Handle backwards compatibility with getter arguments that were added + # to the API after users started writing custom getters. + custom_getter_kwargs = { + "getter": _true_getter, + "name": name, + "shape": shape, + "embedding_block_num": embedding_block_num, + "dtype": dtype, + "initializer": initializer, + "regularizer": regularizer, + "reuse": reuse, + "trainable": trainable, + "collections": collections, + "caching_device": caching_device, + "partitioner": partitioner, + "validate_shape": validate_shape, + "use_resource": use_resource, + "invalid_key": invalid_key, + "evconfig": evconfig, + "ht_partition_num": ht_partition_num, + } + # `fn_args` and `has_kwargs` can handle functions, `functools.partial`, + # `lambda`. 
+ if "constraint" in function_utils.fn_args(custom_getter) or function_utils.has_kwargs(custom_getter): + custom_getter_kwargs["constraint"] = constraint + return custom_getter(**custom_getter_kwargs) + else: + return _true_getter( + name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def _get_single_variable( + self, + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + partition_info=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + validate_shape=True, + use_resource=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Get or create a single Variable (e.g. + + a shard or entire variable). + + See the documentation of get_variable above (ignore partitioning components) + for details. + + Args: + name: see get_variable. + shape: see get_variable. + dtype: see get_variable. + initializer: see get_variable. + regularizer: see get_variable. + partition_info: _PartitionInfo object. + reuse: see get_variable. + trainable: see get_variable. + collections: see get_variable. + caching_device: see get_variable. + validate_shape: see get_variable. + use_resource: see get_variable. + constraint: see get_variable. + + Returns: + A Variable. See documentation of get_variable above. + + Raises: + ValueError: See documentation of get_variable above. + """ + # Set to true if initializer is a constant. + initializing_from_value = False + if initializer is not None and not callable(initializer): + initializing_from_value = True + if shape is not None and initializing_from_value: + raise ValueError("If initializer is a constant, do not specify shape.") + + dtype = dtypes.as_dtype(dtype) + shape = tensor_shape.as_shape(shape) + + if name in self._vars: + # Here we handle the case when returning an existing variable. + if reuse is False: + var = self._vars[name] + err_msg = ( + "Variable %s already exists, disallowed." + " Did you mean to set reuse=True or " + "reuse=tf.AUTO_REUSE in VarScope?" % name + ) + # ResourceVariables don't have an op associated with so no traceback + if isinstance(var, resource_variable_ops.ResourceVariable): + raise ValueError(err_msg) + tb = var.op.traceback[::-1] + # Throw away internal tf entries and only take a few lines. In some + # cases the traceback can be longer (e.g. if someone uses factory + # functions to create variables) so we take more than needed in the + # default case. + tb = [x for x in tb if "tensorflow/python" not in x[0]][:5] + raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(traceback.format_list(tb)))) + found_var = self._vars[name] + from tensorflow.python.ops.hash_table import hash_table + if isinstance(found_var, (hash_table.HashTable, hash_table.DistributedHashTable)): + raise ValueError( + "Trying to reuse variable %s, but an existing variable is a" + " HashTable or DistributedHashTable, can not reuse it." % (name) + ) + if not shape.is_compatible_with(found_var.get_shape()): + raise ValueError( + "Trying to share variable %s, but specified shape %s" + " and found shape %s." 
% (name, shape, found_var.get_shape()) + ) + if not dtype.is_compatible_with(found_var.dtype): + dtype_str = dtype.name + found_type_str = found_var.dtype.name + raise ValueError( + "Trying to share variable %s, but specified dtype %s" + " and found dtype %s." % (name, dtype_str, found_type_str) + ) + return found_var + + # Create the tensor to initialize the variable with default value. + if initializer is None: + initializer, initializing_from_value = self._get_default_initializer(name=name, shape=shape, dtype=dtype) + # Enter an init scope when creating the initializer. + with ops.init_scope(): + if initializing_from_value: + init_val = initializer + variable_dtype = None + else: + # Instantiate initializer if provided initializer is a type object. + if tf_inspect.isclass(initializer): + initializer = initializer() + if shape is not None and shape.is_fully_defined(): + if use_resource and invalid_key is not None: + s = [1 if isinstance(initializer, init_ops.Constant) else evconfig.default_value_dim] + shape.as_list() + evconfig.default_value_dim = 1 if isinstance(initializer, init_ops.Constant) else evconfig.default_value_dim + else: + s = shape.as_list() + init_val = functools.partial(initializer, shape=s, dtype=dtype) + variable_dtype = dtype.base_dtype + elif len(tf_inspect.getargspec(initializer).args) == len(tf_inspect.getargspec(initializer).defaults or []): + init_val = initializer + variable_dtype = None + else: + raise ValueError( + "The initializer passed is not valid. It should " + "be a callable with no arguments and the " + "shape should not be provided or an instance of " + "`tf.keras.initializers.*' and `shape` should be " + "fully defined." + ) + + v = default_variable_creator( + initial_value=init_val, + name=name, + trainable=trainable, + collections=collections, + caching_device=caching_device, + embedding_block_num=embedding_block_num, + dtype=variable_dtype, + validate_shape=validate_shape, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if not context.executing_eagerly() or self._store_eager_variables: + # In eager mode we do not want to keep default references to Variable + # objects as this will prevent their memory from being released. + self._vars[name] = v + logging.vlog(1, "Created variable %s with shape %s and init %s", v.name, format(shape), initializer) + + # Run the regularizer if requested and save the resulting loss. + if regularizer: + with ops.colocate_with(v): + with ops.name_scope(name + "/Regularizer/"): + with ops.init_scope(): + loss = regularizer(v) + if loss is not None: + if context.executing_eagerly(): + v_name = "v_%s" % type(v) + loss_name = "loss_%s" % type(loss) + else: + v_name = v.name + loss_name = loss.name + logging.vlog( + 1, "Applied regularizer to %s and added the result %s " + "to REGULARIZATION_LOSSES.", v_name, loss_name + ) + ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, loss) + return v + + # Initialize variable when no initializer provided + def _get_default_initializer(self, name, shape=None, dtype=dtypes.float32): + """Provide a default initializer and a corresponding value. + + Args: + name: see get_variable. + shape: see get_variable. + dtype: see get_variable. + + Returns: + initializer and initializing_from_value. See get_variable above. + + Raises: + ValueError: When giving unsupported dtype. 
+ """ + del shape + # If dtype is DT_FLOAT, provide a uniform unit scaling initializer + if dtype.is_floating: + initializer = init_ops.glorot_uniform_initializer() + initializing_from_value = False + # If dtype is DT_INT/DT_UINT, provide a default value `zero` + # If dtype is DT_BOOL, provide a default value `FALSE` + elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool or dtype == dtypes.string: + initializer = init_ops.zeros_initializer() + initializing_from_value = False + # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here? + else: + raise ValueError("An initializer for variable %s of %s is required" % (name, dtype.base_dtype)) + + return initializer, initializing_from_value + + +# @tf_export(v1=["get_embedding_variable"]) +def get_embedding_variable_internal( + name, + embedding_dim, + key_dtype=dtypes.int64, + value_dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + custom_getter=None, + constraint=None, + steps_to_live=None, + init_data_source=None, + ev_option=ev_variables.EmbeddingVariableOption() +): + if key_dtype == dtypes.int64: + invalid_key = 9223372036854775807 + elif key_dtype == dtypes.int32: + invalid_key = -1 + elif key_dtype == dtypes.string: + invalid_key = "" + else: + raise ValueError("Not support key_dtype: %s, only support int64/int32/string" % key_dtype) + l2_weight_threshold = -1.0 + if initializer is None and ev_option.init.initializer is None: + initializer = init_ops.truncated_normal_initializer() + elif ev_option.init.initializer is not None: + if initializer is not None: + logger.warning("Use initializer in InitializerOption.") + initializer = ev_option.init.initializer + if ev_option.evict is not None: + if isinstance(ev_option.evict, ev_variables.GlobalStepEvict): + if steps_to_live is not None: + logger.warning("Warning: steps_to_live is double set, the steps_to_live in EvcitConfig is valid") + steps_to_live = ev_option.evict.steps_to_live + elif isinstance(ev_option.evict, ev_variables.L2WeightEvict): + l2_weight_threshold = ev_option.evict.l2_weight_threshold + else: + l2_weight_threshold = -1.0 + if steps_to_live is not None and l2_weight_threshold > 0: + raise ValueError("step_to_live and l2_weight_threshold can't be enabled at same time.") + return VariableScope(reuse=False).get_embedding_variable( + name, + shape=embedding_dim, + dtype=value_dtype, + initializer=initializer, + regularizer=regularizer, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=True, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=ev_variables.EmbeddingVariableConfig( + steps_to_live=steps_to_live, + init_data_source=init_data_source, + ht_type=ev_option.ht_type, + l2_weight_threshold=l2_weight_threshold, + filter_strategy=ev_option.filter_strategy, + storage_type=ev_option.storage_option.storage_type, + storage_path=ev_option.storage_option.storage_path, + storage_size=ev_option.storage_option.storage_size, + storage_cache_strategy=ev_option.storage_option.cache_strategy, + layout=ev_option.storage_option.layout, + default_value_dim=ev_option.init.default_value_dim, + default_value_no_permission=ev_option.init.default_value_no_permission + ), + ht_partition_num=ev_option.ht_partition_num + ) + + +# @tf_export(v1=["get_embedding_variable_v2"]) +def get_embedding_variable_v2_internal( + name, 
+    embedding_dim,
+    key_dtype=dtypes.int64,
+    value_dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=True,
+    collections=None,
+    caching_device=None,
+    partitioner=None,
+    validate_shape=True,
+    custom_getter=None,
+    constraint=None,
+    evconfig=ev_variables.EmbeddingVariableConfig(),
+    ht_partition_num=1000
+):
+  if key_dtype == dtypes.int64:
+    invalid_key = 9223372036854775807
+  elif key_dtype == dtypes.int32:
+    invalid_key = -1
+  elif key_dtype == dtypes.string:
+    invalid_key = ""
+  else:
+    raise ValueError("Unsupported key_dtype: %s, only int64/int32/string are supported" % key_dtype)
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer()
+  return VariableScope(reuse=False).get_embedding_variable(
+      name,
+      shape=embedding_dim,
+      dtype=value_dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=True,
+      custom_getter=custom_getter,
+      constraint=constraint,
+      invalid_key=invalid_key,
+      evconfig=evconfig,
+      ht_partition_num=ht_partition_num
+  )
+
+
+@tf_export(v1=["get_embedding_variable"])
+def get_embedding_variable(
+    name,
+    embedding_dim,
+    key_dtype=dtypes.int64,
+    value_dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=True,
+    collections=None,
+    caching_device=None,
+    partitioner=None,
+    validate_shape=True,
+    custom_getter=None,
+    constraint=None,
+    steps_to_live=None,
+    init_data_source=None,
+    ev_option=ev_variables.EmbeddingVariableOption()
+):
+  if key_dtype == dtypes.int64:
+    invalid_key = 9223372036854775807
+  elif key_dtype == dtypes.int32:
+    invalid_key = -1
+  elif key_dtype == dtypes.string:
+    invalid_key = ""
+  else:
+    raise ValueError("Unsupported key_dtype: %s, only int64/int32/string are supported" % key_dtype)
+  l2_weight_threshold = -1.0
+  if initializer is None and ev_option.init.initializer is None:
+    initializer = init_ops.truncated_normal_initializer()
+  elif ev_option.init.initializer is not None:
+    if initializer is not None:
+      logger.warning("Using the initializer given in InitializerOption.")
+    initializer = ev_option.init.initializer
+  if steps_to_live is not None:
+    logger.warning("steps_to_live is deprecated; use tf.GlobalStepEvict(steps_to_live) instead.")
+  if ev_option.evict is not None:
+    if isinstance(ev_option.evict, ev_variables.GlobalStepEvict):
+      if steps_to_live is not None:
+        logger.warning("steps_to_live is set twice; the value from GlobalStepEvict takes effect.")
+      steps_to_live = ev_option.evict.steps_to_live
+    elif isinstance(ev_option.evict, ev_variables.L2WeightEvict):
+      l2_weight_threshold = ev_option.evict.l2_weight_threshold
+  else:
+    l2_weight_threshold = -1.0
+  if steps_to_live is not None and l2_weight_threshold > 0:
+    raise ValueError("steps_to_live and l2_weight_threshold can't be enabled at the same time.")
+  return VariableScope(reuse=False).get_embedding_variable(
+      name,
+      shape=embedding_dim,
+      dtype=value_dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=True,
+      custom_getter=custom_getter,
+      constraint=constraint,
+      invalid_key=invalid_key,
+      evconfig=ev_variables.EmbeddingVariableConfig(
+          steps_to_live=steps_to_live,
+          init_data_source=init_data_source,
+          ht_type=ev_option.ht_type,
+          l2_weight_threshold=l2_weight_threshold,
+
filter_strategy=ev_option.filter_strategy, + storage_type=ev_option.storage_option.storage_type, + storage_path=ev_option.storage_option.storage_path, + storage_size=ev_option.storage_option.storage_size, + storage_cache_strategy=ev_option.storage_option.cache_strategy, + layout=ev_option.storage_option.layout, + default_value_dim=ev_option.init.default_value_dim, + default_value_no_permission=ev_option.init.default_value_no_permission + ), + ht_partition_num=ev_option.ht_partition_num + ) + + +def default_variable_creator( + initial_value=None, + trainable=None, + collections=None, + validate_shape=True, + caching_device=None, + name=None, + variable_def=None, + dtype=None, + embedding_block_num=None, + import_scope=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + initializer=None, + ht_partition_num=1000 +): + if invalid_key is not None: + emb_blocknum = embedding_block_num + if emb_blocknum is None: + ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name, + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=evconfig, + # initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + ev.set_init_data_source_initializer(evconfig.init_data_source) + return ev + else: + evconfig.block_num = emb_blocknum + evlist = [] + block_evconfig = copy.copy(evconfig) + block_evconfig.handle_name = name + block_evconfig.emb_index = 0 + primary_ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name + "/block0", + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=block_evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + primary_ev.set_init_data_source_initializer(evconfig.init_data_source) + evlist.append(primary_ev) + block_evconfig.primary = primary_ev + with ops.colocate_with(primary_ev): + block_evconfig.handle_name = primary_ev._block_handle_name + for i in range(emb_blocknum - 1): + slave_evconfig = copy.copy(block_evconfig) + slave_evconfig.emb_index = i + 1 + slave_evconfig._slot_num = primary_ev._slot_num + slave_ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name + "/block" + str(i + 1), + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=slave_evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + slave_ev._set_init_data_source_initializer(evconfig.init_data_source) + evlist.append(slave_ev) + dyn_ev = kv_variable_ops.DynamicEmbeddingVariable(name, evlist) + return dyn_ev diff --git a/deepray/custom_ops/embedding_variable/variables.py b/deepray/custom_ops/embedding_variable/variables.py new file mode 100644 index 00000000..31a1c6ad --- /dev/null +++ b/deepray/custom_ops/embedding_variable/variables.py @@ -0,0 +1,206 @@ +from tensorflow.python.framework import dtypes +from 
tensorflow.python.lib.io import file_io
+from tensorflow.python.util.tf_export import tf_export
+
+from deepray.custom_ops.embedding_variable import config_pb2
+from deepray.utils import logging_util
+
+logger = logging_util.get_logger()
+
+
+@tf_export(v1=["InitializerOption"])
+class InitializerOption(object):
+
+  def __init__(self, initializer=None, default_value_dim=4096, default_value_no_permission=.0):
+    self.initializer = initializer
+    self.default_value_dim = default_value_dim
+    self.default_value_no_permission = default_value_no_permission
+    if default_value_dim <= 0:
+      logger.warning("default_value_dim must be greater than 0; falling back to the default of 4096.")
+      self.default_value_dim = 4096
+
+
+@tf_export(v1=["GlobalStepEvict"])
+class GlobalStepEvict(object):
+
+  def __init__(self, steps_to_live=None):
+    self.steps_to_live = steps_to_live
+
+
+@tf_export(v1=["L2WeightEvict"])
+class L2WeightEvict(object):
+
+  def __init__(self, l2_weight_threshold=-1.0):
+    self.l2_weight_threshold = l2_weight_threshold
+    if l2_weight_threshold <= 0 and l2_weight_threshold != -1.0:
+      logger.warning("l2_weight_threshold is invalid; L2-weight-based eviction is disabled")
+
+
+@tf_export(v1=["CheckpointOption"])
+class CheckpointOption(object):
+
+  def __init__(
+      self,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      always_load_from_specific_ckpt=False,
+      init_data_source=None
+  ):
+    self.ckpt_to_load_from = ckpt_to_load_from
+    self.tensor_name_in_ckpt = tensor_name_in_ckpt
+    self.always_load_from_specific_ckpt = always_load_from_specific_ckpt
+    self.init_data_source = init_data_source
+
+
+@tf_export(v1=["StorageOption"])
+class StorageOption(object):
+
+  def __init__(
+      self,
+      storage_type=None,
+      storage_path=None,
+      storage_size=[1024 * 1024 * 1024],
+      cache_strategy=config_pb2.CacheStrategy.LFU,
+      layout=None
+  ):
+    self.storage_type = storage_type
+    self.storage_path = storage_path
+    self.storage_size = storage_size
+    self.cache_strategy = cache_strategy
+    self.layout = layout
+    if not isinstance(storage_size, list):
+      raise ValueError("storage_size should be a list")
+    if len(storage_size) < 4:
+      for i in range(len(storage_size), 4):
+        storage_size.append(1024 * 1024 * 1024)
+    if storage_path is not None:
+      if storage_type is None:
+        raise ValueError("storage_type mustn't be None when storage_path is set")
+      else:
+        if not file_io.file_exists(storage_path):
+          file_io.recursive_create_dir(storage_path)
+    else:
+      if storage_type is not None and storage_type in [
+          config_pb2.StorageType.LEVELDB, config_pb2.StorageType.SSDHASH, config_pb2.StorageType.DRAM_SSDHASH,
+          config_pb2.StorageType.DRAM_LEVELDB
+      ]:
+        raise ValueError("storage_path mustn't be None when a disk-backed storage_type is set")
+
+
+@tf_export(v1=["EmbeddingVariableOption"])
+class EmbeddingVariableOption(object):
+
+  def __init__(
+      self,
+      ht_type="",
+      ht_partition_num=1000,
+      evict_option=None,
+      ckpt=None,
+      filter_option=None,
+      storage_option=StorageOption(),
+      init_option=InitializerOption()
+  ):
+    self.ht_type = ht_type
+    self.ht_partition_num = ht_partition_num
+    self.evict = evict_option
+    self.ckpt = ckpt
+    self.filter_strategy = filter_option
+    self.storage_option = storage_option
+    self.init = init_option
+
+
+@tf_export(v1=["CounterFilter"])
+class CounterFilter(object):
+
+  def __init__(self, filter_freq=0):
+    self.filter_freq = filter_freq
+
+
+@tf_export(v1=["CBFFilter"])
+class CBFFilter(object):
+
+  def __init__(self, filter_freq=0, max_element_size=0, false_positive_probability=-1.0, counter_type=dtypes.uint64):
+
if false_positive_probability != -1.0: + if false_positive_probability <= 0.0: + raise ValueError("false_positive_probablity must larger than 0") + else: + if max_element_size <= 0: + raise ValueError("max_element_size must larger than 0 when false_positive_probability is not -1.0") + else: + if max_element_size != 0: + raise ValueError("max_element_size can't be set when false_probability is -1.0") + self.max_element_size = max_element_size + self.false_positive_probability = false_positive_probability + self.counter_type = counter_type + self.filter_freq = filter_freq + + +class EmbeddingVariableConfig(object): + + def __init__( + self, + steps_to_live=None, + steps_to_live_l2reg=None, + l2reg_theta=None, + l2reg_lambda=None, + l2_weight_threshold=-1.0, + ht_type=None, + filter_strategy=None, + ckpt_to_load_from=None, + tensor_name_in_ckpt=None, + always_load_from_specific_ckpt=False, + init_data_source=None, + handle_name=None, + emb_index=None, + slot_index=None, + block_num=None, + primary=None, + slot_num=None, + storage_type=config_pb2.StorageType.DRAM, + storage_path=None, + storage_size=None, + storage_cache_strategy=config_pb2.CacheStrategy.LFU, + layout=None, + default_value_dim=4096, + default_value_no_permission=.0 + ): + self.steps_to_live = steps_to_live + self.steps_to_live_l2reg = steps_to_live_l2reg + self.l2reg_theta = l2reg_theta + self.l2reg_lambda = l2reg_lambda + self.ckpt_to_load_from = ckpt_to_load_from + self.tensor_name_in_ckpt = tensor_name_in_ckpt + self.always_load_from_specific_ckpt = always_load_from_specific_ckpt + self.init_data_source = init_data_source + self.handle_name = handle_name + self.emb_index = emb_index + self.slot_index = slot_index + self.block_num = block_num + self.primary = primary + self.slot_num = slot_num + self.ht_type = ht_type + self.l2_weight_threshold = l2_weight_threshold + self.filter_strategy = filter_strategy + self.storage_type = storage_type + self.storage_path = storage_path + self.storage_size = storage_size + self.storage_cache_strategy = storage_cache_strategy + self.layout = layout + self.default_value_dim = default_value_dim + self.default_value_no_permission = default_value_no_permission + + def reveal(self): + if self.steps_to_live is None: + self.steps_to_live = 0 + if self.steps_to_live_l2reg is None: + self.steps_to_live_l2reg = 0 + if self.l2reg_theta is None: + self.l2reg_theta = 0 + if self.l2reg_lambda is None: + self.l2reg_lambda = 0 + if self.ht_type is None: + self.ht_type = '' + if self.emb_index is None: + self.emb_index = 0 + if self.slot_index is None: + self.slot_index = 0 diff --git a/deepray/custom_ops/ffm_ops/BUILD b/deepray/custom_ops/ffm_ops/BUILD index 3e2bce28..b60b9fea 100644 --- a/deepray/custom_ops/ffm_ops/BUILD +++ b/deepray/custom_ops/ffm_ops/BUILD @@ -1,5 +1,4 @@ load("//deepray:deepray.bzl", "custom_op_library") -load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION") licenses(["notice"]) # Apache 2.0 @@ -10,11 +9,14 @@ custom_op_library( "cc/kernels/ffm_kernels.h", "cc/ops/ffm_ops.cc", ], - copts = [CPLUSPLUS_VERSION], - cuda_srcs = [ + copts = ["-Wno-unused-result"], + gpu_srcs = [ "cc/kernels/ffm_kernels.h", "cc/kernels/ffm_kernels.cu.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( @@ -42,5 +44,9 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":ffm_ops", + "//deepray/layers", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc 
b/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc index 87384401..e3c05133 100644 --- a/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc +++ b/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc @@ -15,10 +15,11 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU +#include "ffm_kernels.h" + #include #include -#include "ffm_kernels.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" diff --git a/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc b/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc index 7aa66618..ec7f6ebb 100644 --- a/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc +++ b/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -43,7 +44,7 @@ REGISTER_OP("FFM") } ctx->set_output(0, ctx->Matrix(batch_size, out_dims)); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("FFMGrad") @@ -57,7 +58,7 @@ REGISTER_OP("FFMGrad") .SetShapeFn([](shape_inference::InferenceContext *ctx) { ctx->set_output(0, ctx->input(1)); ctx->set_output(1, ctx->input(2)); - return Status::OK(); + return TFOkStatus; }); } // namespace tensorflow diff --git a/deepray/custom_ops/ffm_ops/python/ffm_ops.py b/deepray/custom_ops/ffm_ops/python/ffm_ops.py index 4b993931..c40fd3e0 100644 --- a/deepray/custom_ops/ffm_ops/python/ffm_ops.py +++ b/deepray/custom_ops/ffm_ops/python/ffm_ops.py @@ -16,7 +16,7 @@ from deepray.utils.resource_loader import LazySO -gen_ffm_ops = LazySO("custom_ops/feature_cross/_ffm_ops.so") +gen_ffm_ops = LazySO("custom_ops/ffm_ops/_ffm_ops.so") def ffm(left: tf.Tensor, right: tf.Tensor, dim_size: int, int_type: str = 'multiply') -> tf.Tensor: diff --git a/deepray/custom_ops/multiplex_1/BUILD b/deepray/custom_ops/multiplex_1/BUILD index 6d6c699b..6c807e89 100644 --- a/deepray/custom_ops/multiplex_1/BUILD +++ b/deepray/custom_ops/multiplex_1/BUILD @@ -9,11 +9,20 @@ custom_op_library( "multiplex_1_kernel.cc", "multiplex_1_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( - name = "multiplex_1_op", - srcs = ["multiplex_1_op.py"], + name = "multiplex_1", + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = [":multiplex_1_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -29,6 +38,8 @@ py_test( "no_mac", # TODO(b/216321151): Re-enable this test. ], deps = [ - ":multiplex_1_op", + ":multiplex_1", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/build_deps/toolchains/gpu/crosstool/BUILD b/deepray/custom_ops/multiplex_1/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/crosstool/BUILD rename to deepray/custom_ops/multiplex_1/__init__.py diff --git a/deepray/custom_ops/multiplex_1/multiplex_1_op.cc b/deepray/custom_ops/multiplex_1/multiplex_1_op.cc index f2c8d015..3de4974b 100644 --- a/deepray/custom_ops/multiplex_1/multiplex_1_op.cc +++ b/deepray/custom_ops/multiplex_1/multiplex_1_op.cc @@ -13,16 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif +using namespace tensorflow; // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. diff --git a/deepray/custom_ops/multiplex_2/BUILD b/deepray/custom_ops/multiplex_2/BUILD index b4dfb447..5478998b 100644 --- a/deepray/custom_ops/multiplex_2/BUILD +++ b/deepray/custom_ops/multiplex_2/BUILD @@ -13,7 +13,7 @@ custom_op_library( "multiplex_2_kernel.h", "multiplex_2_op.cc", ], - cuda_srcs = [ + gpu_srcs = [ "multiplex_2_kernel.h", "multiplex_2_kernel.cu.cc", ], @@ -21,7 +21,13 @@ custom_op_library( py_library( name = "multiplex_2_op", - srcs = ["multiplex_2_op.py"], + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = ["multiplex_2_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -38,5 +44,7 @@ py_test( ], deps = [ ":multiplex_2_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/build_deps/toolchains/gpu/cuda/BUILD b/deepray/custom_ops/multiplex_2/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/cuda/BUILD rename to deepray/custom_ops/multiplex_2/__init__.py diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc index 7a1e37b0..8174b934 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/multiplex_2/multiplex_2_kernel.h" +#include "multiplex_2_kernel.h" // Please use the appropriate namespace for your project namespace tensorflow { diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc index c405fb4c..bed5e149 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc @@ -15,7 +15,7 @@ limitations under the License. #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#include "deepray/custom_ops/multiplex_2/multiplex_2_kernel.h" +#include "multiplex_2_kernel.h" // Please use the appropriate namespace for your project namespace tensorflow { diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_op.cc b/deepray/custom_ops/multiplex_2/multiplex_2_op.cc index d59e88ac..0748d0ef 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_op.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_op.cc @@ -41,7 +41,7 @@ REGISTER_OP("Examples>MultiplexDense") TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused)); c->set_output(0, out); - return ::tensorflow::Status::OK(); + return ::tensorflow::OkStatus(); }) .Doc(R"doc( Return elements chosen from `a` or `b` depending on `cond`. 
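For context, `Examples>MultiplexDense` follows the same selection rule as `np.where`: element `i` of the output is taken from `a` when `cond[i]` is true and from `b` otherwise. A minimal usage sketch, assuming the `multiplex_2_op` py_library exposes a `multiplex(cond, a, b)` wrapper (the module path and function name are assumptions, not shown in this hunk):

```python
import numpy as np
import tensorflow as tf

# Assumed import path for the Python wrapper built by the multiplex_2_op target.
from deepray.custom_ops.multiplex_2 import multiplex_2_op

cond = tf.constant([True, False, True, False])
a = tf.constant([1, 2, 3, 4], dtype=tf.int64)
b = tf.constant([10, 20, 30, 40], dtype=tf.int64)

# Element-wise selection: a[i] where cond[i] is True, otherwise b[i].
result = multiplex_2_op.multiplex(cond, a, b)
np.testing.assert_array_equal(result.numpy(), np.where(cond.numpy(), a.numpy(), b.numpy()))  # [1, 20, 3, 40]
```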
diff --git a/deepray/custom_ops/multiplex_3/BUILD b/deepray/custom_ops/multiplex_3/BUILD index 068baf4d..008948cf 100644 --- a/deepray/custom_ops/multiplex_3/BUILD +++ b/deepray/custom_ops/multiplex_3/BUILD @@ -14,11 +14,20 @@ custom_op_library( "multiplex_3_kernel.cc", "multiplex_3_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( name = "multiplex_3_op", - srcs = ["multiplex_3_op.py"], + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = [":multiplex_3_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -38,5 +47,7 @@ py_test( ], deps = [ ":multiplex_3_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/third_party/cucollection/BUILD b/deepray/custom_ops/multiplex_3/__init__.py similarity index 100% rename from third_party/cucollection/BUILD rename to deepray/custom_ops/multiplex_3/__init__.py diff --git a/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc b/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc index 374376bf..6c34ffa7 100644 --- a/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc +++ b/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/platform/errors.h" @@ -204,7 +205,7 @@ class MultiplexSparseOp : public OpKernel { indices_tensor.shape().DebugString(), " values: ", values_tensor.shape().DebugString()); } - return Status::OK(); + return TFOkStatus; } }; diff --git a/deepray/custom_ops/multiplex_3/multiplex_3_op.cc b/deepray/custom_ops/multiplex_3/multiplex_3_op.cc index 80be6976..4e852b69 100644 --- a/deepray/custom_ops/multiplex_3/multiplex_3_op.cc +++ b/deepray/custom_ops/multiplex_3/multiplex_3_op.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" +using namespace tensorflow; + // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. // This is the recommendation for out-of-tree ops to avoid name collisions in @@ -52,7 +55,7 @@ REGISTER_OP("Examples>MultiplexSparse") c->set_output(0, c->Matrix(num_rows, dense_rank)); c->set_output(1, c->Vector(num_rows)); c->set_output(2, c->Vector(dense_rank)); - return ::tensorflow::Status::OK(); + return TFOkStatus; }) .Doc(R"doc( Return elements chosen from `a` or `b` depending on `cond`. 
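The shape function above declares the three outputs of `Examples>MultiplexSparse` as the standard COO components of a sparse tensor: an indices matrix, a values vector, and a dense-shape vector. A short sketch (with made-up numbers) of how such components assemble back into a `tf.SparseTensor`:

```python
import tensorflow as tf

# Hypothetical COO components with 2 selected elements and rank 2, matching the
# shapes declared by the op's shape function: [N, rank], [N] and [rank].
indices = tf.constant([[0, 1], [2, 0]], dtype=tf.int64)
values = tf.constant([7, 9], dtype=tf.int64)
dense_shape = tf.constant([3, 4], dtype=tf.int64)

chosen = tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)
print(tf.sparse.to_dense(chosen).numpy())  # [[0 7 0 0], [0 0 0 0], [9 0 0 0]]
```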
diff --git a/deepray/custom_ops/multiplex_4/BUILD b/deepray/custom_ops/multiplex_4/BUILD index b699319b..7cac986d 100644 --- a/deepray/custom_ops/multiplex_4/BUILD +++ b/deepray/custom_ops/multiplex_4/BUILD @@ -10,6 +10,9 @@ custom_op_library( "multiplex_4_kernel.cc", "multiplex_4_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( @@ -38,6 +41,8 @@ py_test( deps = [ ":model_using_multiplex", ":multiplex_4_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/multiplex_4/__init__.py b/deepray/custom_ops/multiplex_4/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/custom_ops/multiplex_4/multiplex_4_op.cc b/deepray/custom_ops/multiplex_4/multiplex_4_op.cc index 88a5ec06..102d2142 100644 --- a/deepray/custom_ops/multiplex_4/multiplex_4_op.cc +++ b/deepray/custom_ops/multiplex_4/multiplex_4_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -42,7 +43,7 @@ Status MultiplexShapeFunction(InferenceContext* c) { TF_RETURN_IF_ERROR(c->Merge(c->input(i), c->input(last), &unused)); } c->set_output(0, c->input(last)); - return Status::OK(); + return TFOkStatus; } REGISTER_OP("Examples>MultiplexDense") diff --git a/deepray/custom_ops/parquet_dataset/BUILD b/deepray/custom_ops/parquet_dataset/BUILD index f8b14c26..f5ce380a 100644 --- a/deepray/custom_ops/parquet_dataset/BUILD +++ b/deepray/custom_ops/parquet_dataset/BUILD @@ -1,5 +1,4 @@ -load("//deepray:deepray.bzl", "custom_op_library") -load("//deepray:tensorflow.bzl", "pybind_extension") +load("//deepray:deepray.bzl", "custom_op_library", "pybind_extension") licenses(["notice"]) # Apache 2.0 @@ -32,10 +31,9 @@ cc_library( "DEEPREC_ARROW_ZEROCOPY", ], deps = [ - "@com_github_apache_arrow//:arrow", - "@eigen3", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", + "@org_apache_arrow//:arrow", ], ) @@ -49,9 +47,9 @@ cc_library( ], deps = [ ":arrow_util", - "@com_github_apache_arrow//:arrow", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", + "@org_apache_arrow//:arrow", ], ) @@ -60,7 +58,9 @@ pybind_extension( srcs = [ "cc/kernels/parquet_pybind.cc", ], - copts = ["-fexceptions"], + copts = [ + "-fexceptions", + ], features = ["-use_header_modules"], module_name = "_parquet_pybind", deps = [ @@ -122,6 +122,8 @@ py_binary( name = "read_parquet_deepray", srcs = ["read_parquet_deepray.py"], deps = [ - "//deepray/custom_ops/parquet_dataset", + ":parquet_dataset", + "@pypi_fastparquet//:pkg", + "@pypi_pandas//:pkg", ], ) diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc index 564df01e..8f57ab9b 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc @@ -14,9 +14,6 @@ limitations under the License. ==============================================================================*/ #include "arrow_util.h" -#include -#include -#include #include #include @@ -26,7 +23,10 @@ limitations under the License. 
#include #include +#include "arrow/array.h" +#include "arrow/util/thread_pool.h" #include "eigen.h" +#include "tensorflow/core/framework/allocation_description.pb.h" namespace tensorflow { namespace data { @@ -252,7 +252,7 @@ class RaggedTensorBuilder : public ::arrow::ArrayVisitor { #define CASE_ARROW_ENUM_SET_DTYPE(PTR, ENUM) \ case ENUM: { \ *PTR = DataTypeToEnum::Type>::value; \ - return Status::OK(); \ + return OkStatus(); \ } Status MakeDataTypeAndRaggedRankFromArrowDataType( @@ -280,7 +280,7 @@ Status MakeDataTypeAndRaggedRankFromArrowDataType( return errors::Unimplemented("Arrow data type ", arrow_dtype->ToString(), " not supported."); } - return Status::OK(); + return OkStatus(); } Status MakeTensorsFromArrowArray( @@ -297,7 +297,7 @@ Status MakeTensorsFromArrowArray( RaggedTensorBuilder builder(dtype, ragged_rank); TF_RETURN_IF_ARROW_ERROR(builder.Build(arrow_array, output_tensors)); - return Status::OK(); + return OkStatus(); } int UpdateArrowCpuThreadPoolCapacityFromEnv() { @@ -315,7 +315,7 @@ ::arrow::Status OpenArrowFile( const std::string& filename) { #if DEEPREC_ARROW_HDFS if (filename.rfind("hdfs://", 0) == 0) { - ::arrow::internal::Uri uri; + ::arrow::util::Uri uri; ARROW_RETURN_NOT_OK(uri.Parse(filename)); ARROW_ASSIGN_OR_RAISE(auto options, ::arrow::fs::HdfsOptions::FromUri(uri)); std::shared_ptr<::arrow::io::HadoopFileSystem> fs; diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h b/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h index f84dc9f3..d61a2140 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/public/version.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "unsupported/Eigen/CXX11/Tensor" // NOTE: EIGEN_MAX_ALIGN_BYTES is 64 in TF 1.x. See: // DeepRec/third_party/eigen.BUILD#L67 diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc index 677b4dfa..7000331d 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.h" +#include "parquet_batch_reader.h" #include #include @@ -20,17 +20,17 @@ limitations under the License. 
#include #include "absl/strings/match.h" -#include "deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.h" +#include "arrow_util.h" namespace tensorflow { namespace data { class ParquetBatchReader::Impl { public: - Impl(const string& filename, const int64 batch_size, - const std::vector& field_names, - const DataTypeVector& field_dtypes, - const std::vector& field_ragged_ranks, + Impl(const string &filename, const int64 batch_size, + const std::vector &field_names, + const DataTypeVector &field_dtypes, + const std::vector &field_ragged_ranks, const int64 partition_count, const int64 partition_index, const bool drop_remainder) : filename_(filename), @@ -44,7 +44,7 @@ class ParquetBatchReader::Impl { Status Open() { if (TF_PREDICT_TRUE(batch_reader_)) { - return Status::OK(); + return OkStatus(); } if (TF_PREDICT_FALSE(partition_index_ >= partition_count_)) { return errors::InvalidArgument("Partition index ", partition_index_, @@ -71,15 +71,15 @@ class ParquetBatchReader::Impl { " must has distinct column names"); } for (size_t i = 0; i < field_names_.size(); ++i) { - auto& cname = field_names_[i]; + auto &cname = field_names_[i]; int column_index = schema->GetFieldIndex(cname); if (TF_PREDICT_FALSE(column_index < 0)) { return errors::NotFound("No column called `", cname, "` found in ", filename_); } column_indices_.push_back(column_index); - const auto& expected_dtype = field_dtypes_[i]; - const auto& expected_ragged_rank = field_ragged_ranks_[i]; + const auto &expected_dtype = field_dtypes_[i]; + const auto &expected_ragged_rank = field_ragged_ranks_[i]; DataType actual_dtype; int32 actual_ragged_rank = 0; TF_RETURN_IF_ERROR(ArrowUtil::MakeDataTypeAndRaggedRankFromArrowDataType( @@ -101,10 +101,10 @@ class ParquetBatchReader::Impl { TF_RETURN_IF_ARROW_ERROR(reader_->GetRecordBatchReader( row_group_indices_, column_indices_, &batch_reader_)); - return Status::OK(); + return OkStatus(); } - Status Read(std::vector* output_tensors) { + Status Read(std::vector *output_tensors) { // Read next batch from parquet file. 
std::shared_ptr<::arrow::RecordBatch> batch; TF_RETURN_IF_ARROW_ERROR(batch_reader_->ReadNext(&batch)); @@ -123,7 +123,7 @@ class ParquetBatchReader::Impl { field_dtypes_[i], field_ragged_ranks_[i], arrays[i], output_tensors)); } - return Status::OK(); + return OkStatus(); } private: @@ -142,9 +142,9 @@ class ParquetBatchReader::Impl { }; ParquetBatchReader::ParquetBatchReader( - const string& filename, const int64 batch_size, - const std::vector& field_names, const DataTypeVector& field_dtypes, - const std::vector& field_ragged_ranks, const int64 partition_count, + const string &filename, const int64 batch_size, + const std::vector &field_names, const DataTypeVector &field_dtypes, + const std::vector &field_ragged_ranks, const int64 partition_count, const int64 partition_index, const bool drop_remainder) : pimpl_(new ParquetBatchReader::Impl( filename, batch_size, field_names, field_dtypes, field_ragged_ranks, @@ -152,7 +152,7 @@ ParquetBatchReader::ParquetBatchReader( Status ParquetBatchReader::Open() { return pimpl_->Open(); } -Status ParquetBatchReader::Read(std::vector* output_tensors) { +Status ParquetBatchReader::Read(std::vector *output_tensors) { return pimpl_->Read(output_tensors); } diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc index 1bdcd582..7ab81028 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h" +#include "parquet_dataset_ops.h" #include @@ -83,14 +83,14 @@ class ParquetTabularDatasetOp::Dataset : public DatasetBase { return output_shapes_; } - Status CheckExternalState() const override { return Status::OK(); } + Status CheckExternalState() const override { return OkStatus(); } string DebugString() const override { return "ParquetTabularDatasetOp::Dataset"; } Status InputDatasets(std::vector* inputs) const override { - return Status::OK(); + return OkStatus(); } protected: @@ -122,7 +122,7 @@ class ParquetTabularDatasetOp::Dataset : public DatasetBase { {"partition_index", partition_index}, {"drop_remainder", drop_remainder}}, output)); - return Status::OK(); + return OkStatus(); } private: @@ -159,7 +159,7 @@ class ParquetTabularDatasetOp::Dataset::Iterator return s; } *end_of_sequence = true; - return Status::OK(); + return OkStatus(); } protected: diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h index 11659001..047a0277 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h @@ -15,7 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARQUET_DATASET_OPS_H_ #define TENSORFLOW_CORE_KERNELS_DATA_PARQUET_DATASET_OPS_H_ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.h" +#include "parquet_batch_reader.h" #include "tensorflow/core/framework/dataset.h" namespace tensorflow { diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc index fc06b1c0..a516bb21 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc @@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include - #include #include #include #include #include "arrow_util.h" +#include "pybind11/complex.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace tensorflow { namespace data { diff --git a/deepray/custom_ops/parquet_dataset/python/dataframe.py b/deepray/custom_ops/parquet_dataset/python/dataframe.py index c7488515..b12ebc7e 100644 --- a/deepray/custom_ops/parquet_dataset/python/dataframe.py +++ b/deepray/custom_ops/parquet_dataset/python/dataframe.py @@ -23,9 +23,10 @@ from __future__ import print_function import collections + import numpy as np +import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin - from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -59,16 +60,12 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d is None: + raise ValueError(f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError(f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -130,16 +127,15 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.TensorShape(batch_size) if i == 0 else tensor_shape.TensorShape(None)) - return self.map( - lambda i: tensor_shape.TensorShape(batch_size).concatenate(self._shape) - if i == 0 else tensor_shape.TensorShape(None) - ) + return self.map(lambda _: tf.TensorShape(None)) + return self.map(lambda i: tf.TensorShape(None).concatenate(self._shape) if i == 0 else tf.TensorShape(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.TensorShape(batch_size) + @property + def output_specs(self): + shape = tf.TensorShape(None) if self._shape is not None: shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py b/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py index a8766954..67330c7b 100644 --- 
a/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py +++ b/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function +import tensorflow as tf from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes @@ -32,6 +33,7 @@ from .parquet_pybind import parquet_filenames_and_fields _parquet_dataset_ops_so = LazySO("custom_ops/parquet_dataset/_parquet_dataset_ops.so") +gen_parquet_ops = _parquet_dataset_ops_so.ops class DataFrameValueSpec(type_spec.BatchableTypeSpec): @@ -40,24 +42,22 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def __init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. """ if field.incomplete: raise ValueError(f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -81,7 +81,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -105,12 +105,18 @@ def __init__(self, filename, batch_size, fields, partition_count=1, partition_in self._filename = ops.convert_to_tensor(filename, dtype=dtypes.string, name='filename') self._batch_size = ops.convert_to_tensor(batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) if f.ragged_rank > 0 else - tensor_spec.TensorSpec(shape=[batch_size if drop_remainder else None], dtype=f.dtype) - ) for f in self._fields - } + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tf.TensorShape(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten({f.name: f.ragged_rank for f in self._fields}) @@ -118,7 +124,7 @@ def __init__(self, filename, batch_size, fields, partition_count=1, partition_in self._partition_index = partition_index self._drop_remainder = drop_remainder - variant_tensor = _parquet_dataset_ops_so.ops.parquet_tabular_dataset_v1( + variant_tensor = gen_parquet_ops.parquet_tabular_dataset_v1( self._filename, self._batch_size, field_names=self._field_names, @@ -227,9 +233,12 @@ def element_spec(self): def _build_dataset(self, dataset_creator, filenames, num_parallel_reads=None, num_sequential_reads=1): """Internal method to create a `ParquetDataset`.""" if num_parallel_reads is None: + # Sequential Reading return filenames.flat_map(dataset_creator) 
if num_parallel_reads == dataset_ops.AUTOTUNE: + # Auto-tuned Parallel Reading return filenames.interleave(dataset_creator, num_parallel_calls=num_parallel_reads) + # Specified Parallel Reading return readers.ParallelInterleaveDataset( filenames, dataset_creator, diff --git a/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py b/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py index 7ce65b64..8bd32f7e 100644 --- a/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py +++ b/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py @@ -20,15 +20,16 @@ import numpy as np from six import string_types as string - from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.platform import tf_logging as logging -from .dataframe import DataFrame +from tensorflow.python.types import core + from deepray.custom_ops.parquet_dataset import _parquet_pybind as _lib +from .dataframe import DataFrame def parquet_fields(filename, fields=None, lower=False): @@ -121,7 +122,7 @@ def parquet_filenames_and_fields(filenames, fields, lower=False): raise ValueError(f'Field {f} must be `hb.data.DataFrame.Field`.') if f.incomplete: raise ValueError(f'Field {f} is incomplete, please specify dtype and ragged_rank') - elif isinstance(filenames, ops.Tensor): + elif isinstance(filenames, core.Tensor): if filenames.dtype != dtypes.string: raise TypeError('`filenames` must be a `tf.Tensor` of `tf.string`.') if fields is None: diff --git a/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py b/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py index 32d1393f..10d3fc26 100644 --- a/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py +++ b/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py @@ -18,16 +18,19 @@ from __future__ import division from __future__ import print_function +import numpy as np +import pandas as pd import os +from six.moves import xrange # pylint: disable=redefined-builtin import tempfile -import numpy as np -import pandas as pd import tensorflow as tf +# from tensorflow.python.data.experimental.ops import parquet_dataset_ops +from deepray.custom_ops.parquet_dataset import parquet_dataset_ops + from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.platform import test - -from deepray.custom_ops.parquet_dataset import parquet_dataset_ops +from tensorflow.python.data.ops.dataset_ops import AUTOTUNE class ParquetDatasetTest(test_base.DatasetTestBase): @@ -42,154 +45,152 @@ def setUpClass(self): def test_read(self): batch_size = 32 - ds = parquet_dataset_ops.ParquetDataset( - self._filename, - batch_size=batch_size, - fields=[parquet_dataset_ops.DataFrame.Field('A', tf.int64), - parquet_dataset_ops.DataFrame.Field('C', tf.int64)] - ) - ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset( + self._filename, + batch_size=batch_size, + fields=[ + parquet_dataset_ops.DataFrame.Field('A', tf.int64), + parquet_dataset_ops.DataFrame.Field('C', tf.int64) + ] + ) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() a = self._df['A'] c = self._df['C'] - i = 0 - for result in ds.take(3): - print(result) - # i += 1 - # start_row = i * batch_size 
- # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['A'], a[start_row:end_row].to_numpy()) - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_schema_auto_detection_read(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df['C'] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_dtype_auto_detection_read(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # ds = parquet_dataset_ops.ParquetDataset( - # [self._filename], - # batch_size=batch_size, - # fields=['B', 'C']) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df['C'] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_dtype_auto_detection_read_lower(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # actual_fields = parquet_dataset_ops.ParquetDataset.read_schema( - # self._filename, ['B', 'D'], lower=True) - # fld = actual_fields[1].name - # ds = parquet_dataset_ops.ParquetDataset( - # [self._filename], - # batch_size=batch_size, - # fields=actual_fields) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df[fld] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result[fld], c[start_row:end_row].to_numpy()) - - # def test_read_from_generator(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields)) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) - - # def test_read_from_generator_parallel(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply( - # parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=3)) - # ds = ds.prefetch(4) - # batch = 
tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) - - # def test_read_from_generator_parallel_auto(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply( - # parquet_dataset_ops.read_parquet( - # batch_size, fields=fields, num_parallel_reads=AUTOTUNE)) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['A'], a[start_row:end_row].to_numpy()) + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_schema_auto_detection_read(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df['C'] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_dtype_auto_detection_read(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size, fields=['B', 'C']) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df['C'] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_dtype_auto_detection_read_lower(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + actual_fields = parquet_dataset_ops.ParquetDataset.read_schema(self._filename, ['B', 'D'], lower=True) + fld = actual_fields[1].name + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size, fields=actual_fields) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df[fld] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result[fld], c[start_row:end_row].to_numpy()) + + def test_read_from_generator(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + 
parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_read_from_generator_parallel(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=3)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_read_from_generator_parallel_auto(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=AUTOTUNE)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) if __name__ == "__main__": diff --git a/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py b/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py index 34411a21..ccd3a14e 100644 --- a/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py +++ b/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py @@ -12,6 +12,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '' _workspace = tempfile.mkdtemp() _filename = os.path.join(_workspace, 'test.parquet') +print(_filename) # _df = pd.DataFrame( # np.random.randint(0, 100, size=(200, 4), dtype=np.int64), # columns=list('ABCd')) @@ -34,17 +35,18 @@ ds = parquet_dataset_ops.ParquetDataset( _filename, batch_size=batch_size, - fields=[ - parquet_dataset_ops.DataFrame.Field('A', tf.int64, ragged_rank=1), - parquet_dataset_ops.DataFrame.Field( - 'B', - tf.int64, - shape=[3], - ), - parquet_dataset_ops.DataFrame.Field('C', tf.int32), - parquet_dataset_ops.DataFrame.Field('D', tf.int64), - parquet_dataset_ops.DataFrame.Field('E', tf.string), - ] + fields=['A', 'C'] + # fields=[ + # parquet_dataset_ops.DataFrame.Field('A', tf.int64, ragged_rank=1), + # parquet_dataset_ops.DataFrame.Field( + # 'B', + # tf.int64, + # shape=[3], + # ), + # parquet_dataset_ops.DataFrame.Field('C', tf.int32), + # parquet_dataset_ops.DataFrame.Field('D', tf.int64), + # 
parquet_dataset_ops.DataFrame.Field('E', tf.string), + # ] ) ds = ds.prefetch(4) diff --git a/deepray/custom_ops/seq2seq/BUILD b/deepray/custom_ops/seq2seq/BUILD index 6eb65487..740c8275 100644 --- a/deepray/custom_ops/seq2seq/BUILD +++ b/deepray/custom_ops/seq2seq/BUILD @@ -11,8 +11,36 @@ custom_op_library( "cc/kernels/beam_search_ops.h", "cc/ops/beam_search_ops.cc", ], - cuda_srcs = [ + gpu_srcs = [ "cc/kernels/beam_search_ops.h", "cc/kernels/beam_search_ops_gpu.cu.cc", ], ) + +py_library( + name = "seq2seq", + srcs = glob( + [ + "python/*.py", + "*.py", + ], + ), + data = [ + ":_beam_search_ops.so", + ], +) + +py_test( + name = "seq2seq_test", + size = "medium", + srcs = glob(["python/tests/*"]), + main = "python/tests/run_all_test.py", + deps = [ + ":seq2seq", + "//deepray/utils", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_pytest_xdist//:pkg", + "@pypi_tensorflow//:pkg", + ], +) diff --git a/deepray/custom_ops/seq2seq/__init__.py b/deepray/custom_ops/seq2seq/__init__.py new file mode 100644 index 00000000..39f88075 --- /dev/null +++ b/deepray/custom_ops/seq2seq/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Additional layers for sequence to sequence models.""" + +from .python import attention_wrapper +from .python import basic_decoder +from .python import beam_search_decoder +from .python import decoder +from .python import loss +from .python import sampler \ No newline at end of file diff --git a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc index 7ea86176..6a5f20ec 100644 --- a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc +++ b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc @@ -32,8 +32,13 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/public/version.h" #include "tensorflow/core/util/work_sharder.h" +#if TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 16 +#include "unsupported/Eigen/CXX11/Tensor" +#else #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#endif namespace tensorflow { namespace deepray { diff --git a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h index ae85f60b..297592d7 100644 --- a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h +++ b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h @@ -18,7 +18,12 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/version.h" +#if TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 16 +#include "unsupported/Eigen/CXX11/Tensor" +#else #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#endif namespace tensorflow { class OpKernelContext; diff --git a/deepray/seq2seq/README.md b/deepray/custom_ops/seq2seq/python/README.md similarity index 90% rename from deepray/seq2seq/README.md rename to deepray/custom_ops/seq2seq/python/README.md index 7430fb3d..bdb3d6a8 100644 --- a/deepray/seq2seq/README.md +++ b/deepray/custom_ops/seq2seq/python/README.md @@ -1,14 +1,14 @@ -# Deepray - Seq2seq +# Addons - Seq2seq ## Contents -https://www.tensorflow.org/deepray/api_docs/python/dp/seq2seq +https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq ## Contribution Guidelines #### Standard API In order to conform with the current API standard, all objects must: * Inherit from proper base class within each module, eg `BaseDecoder` in decoder.py for customized decoder or `_BaseAttentionMechanism` for new attentions. - * Register as a keras global object so it can be serialized properly: `@tf.keras.utils.register_keras_serializable(package='Deepray')` + * Register as a keras global object so it can be serialized properly: `@tf.keras.utils.register_keras_serializable(package='Addons')` #### Testing Requirements * Simple unittests that demonstrate the class is behaving as expected on @@ -67,7 +67,7 @@ logits = outputs.rnn_output ``` ``` python -import deepray as dp +import tensorflow_addons as tfa # TF 2.0, new style @@ -79,9 +79,9 @@ encoder_outputs, state_h, state_c = encoder( encoder_state = (state_h, state_c) # Decoder RNN cell with attention -attention_mechanism = dp.seq2seq.LuongAttention(num_units, encoder_outputs) +attention_mechanism = tfa.seq2seq.LuongAttention(num_units, encoder_outputs) decoder_cell = tf.keras.layers.LSTMCell(num_units) -decoder_cell = dp.seq2seq.AttentionWrapper( +decoder_cell = tfa.seq2seq.AttentionWrapper( decoder_cell, attention_mechanism, attention_layer_size=num_units, @@ -89,11 +89,11 @@ decoder_cell = dp.seq2seq.AttentionWrapper( ) # Sampler -sampler = dp.seq2seq.sampler.TrainingSampler() +sampler = tfa.seq2seq.sampler.TrainingSampler() # Decoder projection_layer = tf.keras.layers.Dense(num_outputs) -decoder = dp.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer) +decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer) # Dynamic decoding decoder_initial_state = decoder_cell.get_initial_state(inputs=decoder_inputs) @@ -145,14 +145,14 @@ outputs, _ = tf.contrib.seq2seq.dynamic_decode(decoder, ...) 
``` python # TF 2.0, new style -import deepray as dp +import tensorflow_addons as tfa # Replicate encoder infos beam_width times -decoder_initial_state = dp.seq2seq.tile_batch( +decoder_initial_state = tfa.seq2seq.tile_batch( encoder_state, multiplier=hparams.beam_width) # Define a beam-search decoder -decoder = dp.seq2seq.BeamSearchDecoder( +decoder = tfa.seq2seq.BeamSearchDecoder( cell=decoder_cell, beam_width=beam_width, output_layer=projection_layer, diff --git a/deepray/custom_ops/seq2seq/python/__init__.py b/deepray/custom_ops/seq2seq/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/seq2seq/attention_wrapper.py b/deepray/custom_ops/seq2seq/python/attention_wrapper.py similarity index 95% rename from deepray/seq2seq/attention_wrapper.py rename to deepray/custom_ops/seq2seq/python/attention_wrapper.py index 830ac6a1..b8eb8ad2 100644 --- a/deepray/seq2seq/attention_wrapper.py +++ b/deepray/custom_ops/seq2seq/python/attention_wrapper.py @@ -17,10 +17,12 @@ import collections import functools import math +from typing import Optional, Callable, Union, List import numpy as np - import tensorflow as tf +from packaging.version import Version +from typeguard import typechecked from deepray.utils import keras_utils from deepray.utils.types import ( @@ -31,8 +33,10 @@ Number, ) -from typeguard import typechecked -from typing import Optional, Callable, Union, List +if Version(tf.__version__) < Version("2.13"): + SERIALIZATION_ARGS = {} +else: + SERIALIZATION_ARGS = {"use_legacy_format": True} class AttentionMechanism(tf.keras.layers.Layer): @@ -355,11 +359,19 @@ def deserialize_inner_layer_from_config(cls, config, custom_objects): config = config.copy() query_layer_config = config.pop("query_layer", None) if query_layer_config: - query_layer = tf.keras.layers.deserialize(query_layer_config, custom_objects=custom_objects) + query_layer = tf.keras.layers.deserialize( + query_layer_config, + custom_objects=custom_objects, + **SERIALIZATION_ARGS, + ) config["query_layer"] = query_layer memory_layer_config = config.pop("memory_layer", None) if memory_layer_config: - memory_layer = tf.keras.layers.deserialize(memory_layer_config, custom_objects=custom_objects) + memory_layer = tf.keras.layers.deserialize( + memory_layer_config, + custom_objects=custom_objects, + **SERIALIZATION_ARGS, + ) config["memory_layer"] = memory_layer return config @@ -375,7 +387,7 @@ def state_size(self): return self.alignments_size def initial_alignments(self, batch_size, dtype): - """Creates the initial alignment values for the `dp.seq2seq.AttentionWrapper` + """Creates the initial alignment values for the `tfa.seq2seq.AttentionWrapper` class. This is important for attention mechanisms that use the previous @@ -395,7 +407,7 @@ def initial_alignments(self, batch_size, dtype): return tf.zeros([batch_size, self._alignments_size], dtype=dtype) def initial_state(self, batch_size, dtype): - """Creates the initial state values for the `dp.seq2seq.AttentionWrapper` class. + """Creates the initial state values for the `tfa.seq2seq.AttentionWrapper` class. 
This is important for attention mechanisms that use the previous alignment to calculate the alignment at the next time step @@ -767,7 +779,9 @@ def get_config(self): "normalize": self.normalize, "probability_fn": self.probability_fn_name, "kernel_initializer": tf.keras.initializers.serialize( - self.kernel_initializer) + self.kernel_initializer, + **SERIALIZATION_ARGS, + ) } # yapf: enable @@ -776,7 +790,10 @@ def get_config(self): @classmethod def from_config(cls, config, custom_objects=None): - config = AttentionMechanism.deserialize_inner_layer_from_config(config, custom_objects=custom_objects) + config = AttentionMechanism.deserialize_inner_layer_from_config( + config, + custom_objects=custom_objects, + ) return cls(**config) @@ -917,7 +934,7 @@ def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode, s test-time, and when hard attention is not desired. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. seed: (optional) Random seed for pre-sigmoid noise. Returns: @@ -974,7 +991,7 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism): to construct its attention distributions. Since the attention scores are passed through a sigmoid, a learnable scalar bias parameter is applied after the score function and before the sigmoid. Otherwise, it is - equivalent to `dp.seq2seq.BahdanauAttention`. This approach is proposed in + equivalent to `tfa.seq2seq.BahdanauAttention`. This approach is proposed in Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, "Online and Linear-Time Attention by Enforcing Monotonic Alignments." @@ -1015,7 +1032,7 @@ def __init__( of the memory is large. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. kernel_initializer: (optional), the name of the initializer for the attention kernel. dtype: The data type for the query and memory layers of the attention @@ -1128,7 +1145,9 @@ def get_config(self): "score_bias_init": self.score_bias_init, "mode": self.mode, "kernel_initializer": tf.keras.initializers.serialize( - self.kernel_initializer), + self.kernel_initializer, + **SERIALIZATION_ARGS, + ), } # yapf: enable @@ -1149,7 +1168,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism): memory it can't attend to any prior points at subsequence output timesteps. It achieves this by using the `_monotonic_probability_fn` instead of `softmax` to construct its attention distributions. Otherwise, it is equivalent to - `dp.seq2seq.LuongAttention`. This approach is proposed in + `tfa.seq2seq.LuongAttention`. This approach is proposed in [Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, "Online and Linear-Time Attention by Enforcing Monotonic Alignments." @@ -1189,7 +1208,7 @@ def __init__( of the memory is large. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. dtype: The data type for the query and memory layers of the attention mechanism. name: Name to use when creating ops. 
@@ -1289,7 +1308,7 @@ class AttentionWrapperState( ), ) ): - """State of a `dp.seq2seq.AttentionWrapper`. + """State of a `tfa.seq2seq.AttentionWrapper`. Attributes: cell_state: The state of the wrapped RNN cell at the previous time @@ -1318,8 +1337,8 @@ def clone(self, **kwargs): >>> batch_size = 1 >>> memory = tf.random.normal(shape=[batch_size, 3, 100]) >>> encoder_state = [tf.zeros((batch_size, 100)), tf.zeros((batch_size, 100))] - >>> attention_mechanism = dp.seq2seq.LuongAttention(100, memory=memory, memory_sequence_length=[3] * batch_size) - >>> attention_cell = dp.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(100), attention_mechanism, attention_layer_size=10) + >>> attention_mechanism = tfa.seq2seq.LuongAttention(100, memory=memory, memory_sequence_length=[3] * batch_size) + >>> attention_cell = tfa.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(100), attention_mechanism, attention_layer_size=10) >>> decoder_initial_state = attention_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32) >>> decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state) @@ -1488,11 +1507,11 @@ class AttentionWrapper(tf.keras.layers.AbstractRNNCell): >>> memory = tf.random.uniform([batch_size, max_time, hidden_size]) >>> memory_sequence_length = tf.fill([batch_size], max_time) >>> - >>> attention_mechanism = dp.seq2seq.LuongAttention(hidden_size) + >>> attention_mechanism = tfa.seq2seq.LuongAttention(hidden_size) >>> attention_mechanism.setup_memory(memory, memory_sequence_length) >>> >>> cell = tf.keras.layers.LSTMCell(hidden_size) - >>> cell = dp.seq2seq.AttentionWrapper( + >>> cell = tfa.seq2seq.AttentionWrapper( ... cell, attention_mechanism, attention_layer_size=hidden_size) >>> >>> inputs = tf.random.uniform([batch_size, hidden_size]) @@ -1520,11 +1539,11 @@ def __init__( ): """Construct the `AttentionWrapper`. - **NOTE** If you are using the `dp.seq2seq.BeamSearchDecoder` with a cell wrapped + **NOTE** If you are using the `tfa.seq2seq.BeamSearchDecoder` with a cell wrapped in `AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - `dp.seq2seq.tile_batch` (NOT `tf.tile`). + `tfa.seq2seq.tile_batch` (NOT `tf.tile`). - The `batch_size` argument passed to the `get_initial_state` method of this wrapper is equal to `true_batch_size * beam_width`. 
- The initial state created with `get_initial_state` above contains a @@ -1538,18 +1557,18 @@ def __init__( >>> sequence_length = tf.convert_to_tensor([5]) >>> encoder_outputs = tf.random.uniform(shape=(batch_size, 5, 10)) >>> encoder_final_state = [tf.zeros((batch_size, 10)), tf.zeros((batch_size, 10))] - >>> tiled_encoder_outputs = dp.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width) - >>> tiled_encoder_final_state = dp.seq2seq.tile_batch(encoder_final_state, multiplier=beam_width) - >>> tiled_sequence_length = dp.seq2seq.tile_batch(sequence_length, multiplier=beam_width) - >>> attention_mechanism = dp.seq2seq.BahdanauAttention(10, memory=tiled_encoder_outputs, memory_sequence_length=tiled_sequence_length) - >>> attention_cell = dp.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(10), attention_mechanism) + >>> tiled_encoder_outputs = tfa.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width) + >>> tiled_encoder_final_state = tfa.seq2seq.tile_batch(encoder_final_state, multiplier=beam_width) + >>> tiled_sequence_length = tfa.seq2seq.tile_batch(sequence_length, multiplier=beam_width) + >>> attention_mechanism = tfa.seq2seq.BahdanauAttention(10, memory=tiled_encoder_outputs, memory_sequence_length=tiled_sequence_length) + >>> attention_cell = tfa.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(10), attention_mechanism) >>> decoder_initial_state = attention_cell.get_initial_state(batch_size=batch_size * beam_width, dtype=tf.float32) >>> decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state) Args: cell: A layer that implements the `tf.keras.layers.AbstractRNNCell` interface. - attention_mechanism: A list of `dp.seq2seq.AttentionMechanism` + attention_mechanism: A list of `tfa.seq2seq.AttentionMechanism` instances single instance. attention_layer_size: A list of Python integers or a single Python integer, the depth of the attention (output) layer(s). If `None` @@ -1673,7 +1692,7 @@ def cell_input_fn(inputs, attention): "When constructing AttentionWrapper %s: " % self.name + "Non-matching batch sizes between the memory " "(encoder output) and initial_cell_state. Are you using " "the BeamSearchDecoder? You may need to tile your " - "initial state via the dp.seq2seq.tile_batch " + "initial state via the tfa.seq2seq.tile_batch " "function with argument multiple=beam_width." ) with tf.control_dependencies( @@ -1752,10 +1771,10 @@ def output_size(self): @property def state_size(self): - """The `state_size` property of `dp.seq2seq.AttentionWrapper`. + """The `state_size` property of `tfa.seq2seq.AttentionWrapper`. Returns: - A `dp.seq2seq.AttentionWrapperState` tuple containing shapes used + A `tfa.seq2seq.AttentionWrapperState` tuple containing shapes used by this object. """ return AttentionWrapperState( @@ -1769,11 +1788,11 @@ def state_size(self): ) # sometimes a TensorArray def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - """Return an initial (zero) state tuple for this `dp.seq2seq.AttentionWrapper`. + """Return an initial (zero) state tuple for this `tfa.seq2seq.AttentionWrapper`. **NOTE** Please see the initializer documentation for details of how - to call `get_initial_state` if using a `dp.seq2seq.AttentionWrapper` - with a `dp.seq2seq.BeamSearchDecoder`. + to call `get_initial_state` if using a `tfa.seq2seq.AttentionWrapper` + with a `tfa.seq2seq.BeamSearchDecoder`. Args: inputs: The inputs that will be fed to this cell. 
@@ -1781,7 +1800,7 @@ def get_initial_state(self, inputs=None, batch_size=None, dtype=None): dtype: The internal state data type. Returns: - An `dp.seq2seq.AttentionWrapperState` tuple containing zeroed out tensors and, + An `tfa.seq2seq.AttentionWrapperState` tuple containing zeroed out tensors and, possibly, empty `TensorArray` objects. Raises: @@ -1803,7 +1822,7 @@ def get_initial_state(self, inputs=None, batch_size=None, dtype=None): "(encoder output) and the requested batch size. Are you using " "the BeamSearchDecoder? If so, make sure your encoder output " "has been tiled to beam_width via " - "dp.seq2seq.tile_batch, and the batch_size= argument " + "tfa.seq2seq.tile_batch, and the batch_size= argument " "passed to get_initial_state is batch_size * beam_width." ) with tf.control_dependencies(self._batch_size_checks(batch_size, error_message)): # pylint: disable=bad-continuation @@ -1844,7 +1863,7 @@ def call(self, inputs, state, **kwargs): Args: inputs: (Possibly nested tuple of) Tensor, the input at this time step. - state: An instance of `dp.seq2seq.AttentionWrapperState` containing + state: An instance of `tfa.seq2seq.AttentionWrapperState` containing tensors from the previous time step. **kwargs: Dict, other keyword arguments for the cell call method. @@ -1852,11 +1871,11 @@ def call(self, inputs, state, **kwargs): A tuple `(attention_or_cell_output, next_state)`, where: - `attention_or_cell_output` depending on `output_attention`. - - `next_state` is an instance of `dp.seq2seq.AttentionWrapperState` + - `next_state` is an instance of `tfa.seq2seq.AttentionWrapperState` containing the state calculated at this time step. Raises: - TypeError: If `state` is not an instance of `dp.seq2seq.AttentionWrapperState`. + TypeError: If `state` is not an instance of `tfa.seq2seq.AttentionWrapperState`. """ if not isinstance(state, AttentionWrapperState): try: @@ -1880,7 +1899,7 @@ def call(self, inputs, state, **kwargs): "When applying AttentionWrapper %s: " % self.name + "Non-matching batch sizes between the memory " "(encoder output) and the query (decoder output). Are you using " "the BeamSearchDecoder? You may need to tile your memory input " - "via the dp.seq2seq.tile_batch function with argument " + "via the tfa.seq2seq.tile_batch function with argument " "multiple=beam_width." ) with tf.control_dependencies(self._batch_size_checks(cell_batch_size, error_message)): # pylint: disable=bad-continuation diff --git a/deepray/seq2seq/basic_decoder.py b/deepray/custom_ops/seq2seq/python/basic_decoder.py similarity index 87% rename from deepray/seq2seq/basic_decoder.py rename to deepray/custom_ops/seq2seq/python/basic_decoder.py index 48a7a75f..de3ea9b9 100644 --- a/deepray/seq2seq/basic_decoder.py +++ b/deepray/custom_ops/seq2seq/python/basic_decoder.py @@ -15,26 +15,25 @@ """A basic decoder that may sample to generate the next input.""" import collections +from typing import Optional import tensorflow as tf +from typeguard import typechecked -from deepray.seq2seq import decoder -from deepray.seq2seq import sampler as sampler_py from deepray.utils import keras_utils - -from typeguard import typechecked -from typing import Optional +from . import decoder +from . import sampler as sampler_py class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))): - """Outputs of a `dp.seq2seq.BasicDecoder` step. + """Outputs of a `tfa.seq2seq.BasicDecoder` step. Attributes: rnn_output: The output for this step. 
If the `output_layer` argument - of `dp.seq2seq.BasicDecoder` was set, it is the output of this layer, otherwise it + of `tfa.seq2seq.BasicDecoder` was set, it is the output of this layer, otherwise it is the output of the RNN cell. sample_id: The token IDs sampled for this step, as returned by the - `sampler` instance passed to `dp.seq2seq.BasicDecoder`. + `sampler` instance passed to `tfa.seq2seq.BasicDecoder`. """ pass @@ -43,11 +42,11 @@ class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_outp class BasicDecoder(decoder.BaseDecoder): """Basic sampling decoder for training and inference. - The `dp.seq2seq.Sampler` instance passed as argument is responsible to sample from + The `tfa.seq2seq.Sampler` instance passed as argument is responsible to sample from the output distribution and produce the input for the next decoding step. The decoding loop is implemented by the decoder in its `__call__` method. - Example using `dp.seq2seq.TrainingSampler` for training: + Example using `tfa.seq2seq.TrainingSampler` for training: >>> batch_size = 4 >>> max_time = 7 @@ -58,10 +57,10 @@ class BasicDecoder(decoder.BaseDecoder): >>> >>> embedding_layer = tf.keras.layers.Embedding(input_vocab_size, embedding_size) >>> decoder_cell = tf.keras.layers.LSTMCell(hidden_size) - >>> sampler = dp.seq2seq.TrainingSampler() + >>> sampler = tfa.seq2seq.TrainingSampler() >>> output_layer = tf.keras.layers.Dense(output_vocab_size) >>> - >>> decoder = dp.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer) + >>> decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer) >>> >>> input_ids = tf.random.uniform( ... [batch_size, max_time], maxval=input_vocab_size, dtype=tf.int64) @@ -76,10 +75,10 @@ class BasicDecoder(decoder.BaseDecoder): >>> logits.shape TensorShape([4, 7, 64]) - Example using `dp.seq2seq.GreedyEmbeddingSampler` for inference: + Example using `tfa.seq2seq.GreedyEmbeddingSampler` for inference: - >>> sampler = dp.seq2seq.GreedyEmbeddingSampler(embedding_layer) - >>> decoder = dp.seq2seq.BasicDecoder( + >>> sampler = tfa.seq2seq.GreedyEmbeddingSampler(embedding_layer) + >>> decoder = tfa.seq2seq.BasicDecoder( ... decoder_cell, sampler, output_layer, maximum_iterations=10) >>> >>> initial_state = decoder_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32) @@ -106,11 +105,11 @@ def __init__( Args: cell: A layer that implements the `tf.keras.layers.AbstractRNNCell` interface. - sampler: A `dp.seq2seq.Sampler` instance. + sampler: A `tfa.seq2seq.Sampler` instance. output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e., `tf.keras.layers.Dense`. Optional layer to apply to the RNN output prior to storing the result or sampling. - **kwargs: Other keyword arguments of `dp.seq2seq.BaseDecoder`. + **kwargs: Other keyword arguments of `tfa.seq2seq.BaseDecoder`. 
""" keras_utils.assert_like_rnncell("cell", cell) self.cell = cell diff --git a/deepray/seq2seq/beam_search_decoder.py b/deepray/custom_ops/seq2seq/python/beam_search_decoder.py similarity index 98% rename from deepray/seq2seq/beam_search_decoder.py rename to deepray/custom_ops/seq2seq/python/beam_search_decoder.py index 5626057b..25082eac 100644 --- a/deepray/seq2seq/beam_search_decoder.py +++ b/deepray/custom_ops/seq2seq/python/beam_search_decoder.py @@ -15,19 +15,18 @@ """A decoder that performs beam search.""" import collections -import numpy as np +from typing import Callable, Optional +import numpy as np import tensorflow as tf +from typeguard import typechecked from deepray import options -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import decoder from deepray.utils import keras_utils from deepray.utils.resource_loader import LazySO from deepray.utils.types import FloatTensorLike, TensorLike, Number - -from typeguard import typechecked -from typing import Callable, Optional +from . import attention_wrapper +from . import decoder _beam_search_so = LazySO("custom_ops/seq2seq/_beam_search_ops.so") @@ -44,7 +43,7 @@ class BeamSearchDecoderState( ), ) ): - """State of a `dp.seq2seq.BeamSearchDecoder`. + """State of a `tfa.seq2seq.BeamSearchDecoder`. Attributes: cell_state: The cell state returned at the previous time step. @@ -64,12 +63,12 @@ class BeamSearchDecoderState( class BeamSearchDecoderOutput( collections.namedtuple("BeamSearchDecoderOutput", ("scores", "predicted_ids", "parent_ids")) ): - """Outputs of a `dp.seq2seq.BeamSearchDecoder` step. + """Outputs of a `tfa.seq2seq.BeamSearchDecoder` step. Attributes: scores: The scores this step, which are the log probabilities over the output vocabulary, possibly penalized by length - and attention coverage. When `dp.seq2seq.BeamSearchDecoder` is created with + and attention coverage. When `tfa.seq2seq.BeamSearchDecoder` is created with `output_all_scores=False` (default), this will be a `float32` `Tensor` of shape `[batch_size, beam_width]` containing the top scores corresponding to the predicted IDs. When `output_all_scores=True`, @@ -93,7 +92,7 @@ class FinalBeamSearchDecoderOutput( predicted_ids: The final prediction. A tensor of shape `[batch_size, T, beam_width]` (or `[T, batch_size, beam_width]` if `output_time_major` is True). Beams are ordered from best to worst. - beam_search_decoder_output: An instance of `dp.seq2seq.BeamSearchDecoderOutput` that + beam_search_decoder_output: An instance of `tfa.seq2seq.BeamSearchDecoderOutput` that describes the state of the beam search. """ @@ -220,7 +219,7 @@ def gather_tree( """ if not options.is_custom_kernel_disabled(): try: - return _beam_search_so.ops.deepray_gather_tree(step_ids, parent_ids, max_sequence_lengths, end_token) + return _beam_search_so.ops.addons_gather_tree(step_ids, parent_ids, max_sequence_lengths, end_token) except tf.errors.NotFoundError: options.warn_fallback("gather_tree") @@ -686,10 +685,10 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): """Beam search decoder. **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in - `dp.seq2seq.AttentionWrapper`, then you must ensure that: + `tfa.seq2seq.AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - `dp.seq2seq.tile_batch` (NOT `tf.tile`). + `tfa.seq2seq.tile_batch` (NOT `tf.tile`). 
- The `batch_size` argument passed to the `get_initial_state` method of this wrapper is equal to `true_batch_size * beam_width`. - The initial state created with `get_initial_state` above contains a @@ -699,11 +698,11 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): An example: ``` - tiled_encoder_outputs = dp.seq2seq.tile_batch( + tiled_encoder_outputs = tfa.seq2seq.tile_batch( encoder_outputs, multiplier=beam_width) - tiled_encoder_final_state = dp.seq2seq.tile_batch( + tiled_encoder_final_state = tfa.seq2seq.tile_batch( encoder_final_state, multiplier=beam_width) - tiled_sequence_length = dp.seq2seq.tile_batch( + tiled_sequence_length = tfa.seq2seq.tile_batch( sequence_length, multiplier=beam_width) attention_mechanism = MyFavoriteAttentionMechanism( num_units=attention_depth, @@ -716,7 +715,7 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): cell_state=tiled_encoder_final_state) ``` - Meanwhile, with `dp.seq2seq.AttentionWrapper`, coverage penalty is suggested to use + Meanwhile, with `tfa.seq2seq.AttentionWrapper`, coverage penalty is suggested to use when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages the decoding to cover all inputs. """ diff --git a/deepray/seq2seq/decoder.py b/deepray/custom_ops/seq2seq/python/decoder.py similarity index 97% rename from deepray/seq2seq/decoder.py rename to deepray/custom_ops/seq2seq/python/decoder.py index 97875555..f5b9bad5 100644 --- a/deepray/seq2seq/decoder.py +++ b/deepray/custom_ops/seq2seq/python/decoder.py @@ -15,14 +15,14 @@ """Base classes and functions for dynamic decoding.""" import abc - -import tensorflow as tf -from deepray.utils.types import TensorLike -from typeguard import typechecked from typing import Any, Optional, Tuple, Union +import tensorflow as tf # TODO: Find public API alternatives to these from tensorflow.python.ops import control_flow_util +from typeguard import typechecked + +from deepray.utils.types import TensorLike class Decoder(metaclass=abc.ABCMeta): @@ -37,7 +37,7 @@ class Decoder(metaclass=abc.ABCMeta): finished. - `training`: boolean whether it should behave in training mode or in inference mode. - - `outputs`: instance of `dp.seq2seq.BasicDecoderOutput`. Result of the decoding, at + - `outputs`: instance of `tfa.seq2seq.BasicDecoderOutput`. Result of the decoding, at each time step. """ @@ -105,12 +105,12 @@ def tracks_own_finished(self): """Describes whether the Decoder keeps track of finished states. Most decoders will emit a true/false `finished` value independently - at each time step. In this case, the `dp.seq2seq.dynamic_decode` function keeps + at each time step. In this case, the `tfa.seq2seq.dynamic_decode` function keeps track of which batch entries are already finished, and performs a logical OR to insert new batches to the finished set. Some decoders, however, shuffle batches / beams between time steps and - `dp.seq2seq.dynamic_decode` will mix up the finished state across these entries + `tfa.seq2seq.dynamic_decode` will mix up the finished state across these entries because it does not track the reshuffle across time steps. In this case, it is up to the decoder to declare that it will keep track of its own finished state by setting this property to `True`. @@ -135,7 +135,7 @@ class BaseDecoder(tf.keras.layers.Layer): finished. - `training`: boolean whether it should behave in training mode or in inference mode. - - `outputs`: instance of `dp.seq2seq.BasicDecoderOutput`. 
Result of the decoding, at + - `outputs`: instance of `tfa.seq2seq.BasicDecoderOutput`. Result of the decoding, at each time step. """ @@ -238,12 +238,12 @@ def tracks_own_finished(self): """Describes whether the Decoder keeps track of finished states. Most decoders will emit a true/false `finished` value independently - at each time step. In this case, the `dp.seq2seq.dynamic_decode` function keeps + at each time step. In this case, the `tfa.seq2seq.dynamic_decode` function keeps track of which batch entries are already finished, and performs a logical OR to insert new batches to the finished set. Some decoders, however, shuffle batches / beams between time steps and - `dp.seq2seq.dynamic_decode` will mix up the finished state across these entries + `tfa.seq2seq.dynamic_decode` will mix up the finished state across these entries because it does not track the reshuffle across time steps. In this case, it is up to the decoder to declare that it will keep track of its own finished state by setting this property to `True`. @@ -274,7 +274,7 @@ def dynamic_decode( Calls `initialize()` once and `step()` repeatedly on the decoder object. Args: - decoder: A `dp.seq2seq.Decoder` or `dp.seq2seq.BaseDecoder` instance. + decoder: A `tfa.seq2seq.Decoder` or `tfa.seq2seq.BaseDecoder` instance. output_time_major: Python boolean. Default: `False` (batch major). If `True`, outputs are returned as time major tensors (this mode is faster). Otherwise, outputs are returned as batch major tensors (this diff --git a/deepray/seq2seq/loss.py b/deepray/custom_ops/seq2seq/python/loss.py similarity index 99% rename from deepray/seq2seq/loss.py rename to deepray/custom_ops/seq2seq/python/loss.py index 148e8262..7d3cb9b6 100644 --- a/deepray/seq2seq/loss.py +++ b/deepray/custom_ops/seq2seq/python/loss.py @@ -14,11 +14,12 @@ # ============================================================================== """Loss functions for sequence models.""" -import tensorflow as tf -from deepray.utils.types import TensorLike +from typing import Callable, Optional +import tensorflow as tf from typeguard import typechecked -from typing import Callable, Optional + +from deepray.utils.types import TensorLike def sequence_loss( diff --git a/deepray/seq2seq/sampler.py b/deepray/custom_ops/seq2seq/python/sampler.py similarity index 99% rename from deepray/seq2seq/sampler.py rename to deepray/custom_ops/seq2seq/python/sampler.py index af6b5f1c..069d70db 100644 --- a/deepray/seq2seq/sampler.py +++ b/deepray/custom_ops/seq2seq/python/sampler.py @@ -15,13 +15,14 @@ """Objects sampling from the decoder output distribution and producing the next input.""" import abc +from typing import Callable, Optional import tensorflow as tf -from deepray.seq2seq import decoder -from deepray.utils.types import Initializer, TensorLike from typeguard import typechecked -from typing import Callable, Optional + from deepray.utils import types +from deepray.utils.types import Initializer, TensorLike +from . import decoder _transpose_batch_time = decoder._transpose_batch_time @@ -31,7 +32,7 @@ class Sampler(metaclass=abc.ABCMeta): Sampler classes implement the logic of sampling from the decoder output distribution and producing the inputs for the next decoding step. In most cases, they should not be - used directly but passed to a `dp.seq2seq.BasicDecoder` instance that will manage the + used directly but passed to a `tfa.seq2seq.BasicDecoder` instance that will manage the sampling. 
Here is an example using a training sampler directly to implement a custom decoding @@ -41,7 +42,7 @@ class Sampler(metaclass=abc.ABCMeta): >>> max_time = 7 >>> hidden_size = 16 >>> - >>> sampler = dp.seq2seq.TrainingSampler() + >>> sampler = tfa.seq2seq.TrainingSampler() >>> cell = tf.keras.layers.LSTMCell(hidden_size) >>> >>> input_tensors = tf.random.uniform([batch_size, max_time, hidden_size]) diff --git a/deepray/custom_ops/seq2seq/python/tests/__init__.py b/deepray/custom_ops/seq2seq/python/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/seq2seq/tests/attention_wrapper_test.py b/deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py similarity index 98% rename from deepray/seq2seq/tests/attention_wrapper_test.py rename to deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py index 6332f32b..82c91ba5 100644 --- a/deepray/seq2seq/tests/attention_wrapper_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for dp.seq2seq.attention_wrapper.""" +"""Tests for tfa.seq2seq.attention_wrapper.""" import collections -import pytest import numpy as np +import pytest import tensorflow as tf +from packaging.version import Version -from deepray.seq2seq import attention_wrapper as wrapper -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import attention_wrapper as wrapper +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py class DummyData: @@ -123,6 +124,9 @@ def test_save_load_layer(attention_cls): model.compile("rmsprop", "mse") y_ref = model.predict_on_batch([x_test, dummy_data.query, dummy_data.state]) + if Version(tf.__version__) >= Version("2.13"): + model.use_legacy_config = True + config = model.get_config() weights = model.get_weights() loaded_model = tf.keras.Model.from_config(config, custom_objects={attention_cls.__name__: attention_cls}) @@ -173,7 +177,7 @@ def test_masking(): @pytest.mark.parametrize("attention_cls", attention_classes) def test_memory_re_setup(attention_cls): - class MyModel(tf.keras.Model): + class MyModel(tf.keras.models.Model): def __init__(self, vocab, embedding_dim, memory_size, units): super().__init__() @@ -749,7 +753,7 @@ def test_luong_monotonic_scaled(): def test_attention_state_with_keras_rnn(): - # See https://github.com/tensorflow/deepray/issues/1095. + # See https://github.com/tensorflow/addons/issues/1095. cell = tf.keras.layers.LSTMCell(8) mechanism = wrapper.LuongAttention(units=8, memory=tf.ones((2, 4, 8))) diff --git a/deepray/seq2seq/tests/basic_decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py similarity index 99% rename from deepray/seq2seq/tests/basic_decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py index 74a62e29..5e47e17d 100644 --- a/deepray/seq2seq/tests/basic_decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.basic_decoder.""" +"""Tests for tfa.seq2seq.basic_decoder.""" import numpy as np import pytest - import tensorflow as tf -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import attention_wrapper +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py @pytest.mark.parametrize("use_output_layer", [True, False]) diff --git a/deepray/seq2seq/tests/beam_search_decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py similarity index 98% rename from deepray/seq2seq/tests/beam_search_decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py index 7ea81644..854554d0 100644 --- a/deepray/seq2seq/tests/beam_search_decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for dp.seq2seq.seq2seq.beam_search_decoder.""" +"""Tests for tfa.seq2seq.seq2seq.beam_search_decoder.""" import numpy as np import pytest import tensorflow as tf -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import beam_search_decoder, gather_tree +from deepray.custom_ops.seq2seq import attention_wrapper +from deepray.custom_ops.seq2seq import beam_search_decoder @pytest.mark.usefixtures("run_custom_and_py_ops") @@ -42,7 +42,7 @@ def test_gather_tree(): expected_result = np.array([[[2, 2, 2], [6, 5, 6], [7, 8, 9]], [[2, 4, 4], [7, 6, 6], [8, 9, 10]]]).transpose([1, 0, 2]) - res = gather_tree( + res = beam_search_decoder.gather_tree( predicted_ids, parent_ids, max_sequence_lengths=max_sequence_lengths, diff --git a/deepray/seq2seq/tests/beam_search_ops_test.py b/deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py similarity index 94% rename from deepray/seq2seq/tests/beam_search_ops_test.py rename to deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py index 0d457da6..06270686 100644 --- a/deepray/seq2seq/tests/beam_search_ops_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.beam_search_ops.""" +"""Tests for tfa.seq2seq.beam_search_ops.""" import itertools @@ -20,7 +20,7 @@ import pytest import tensorflow as tf -from deepray.seq2seq import gather_tree +from deepray.custom_ops.seq2seq import beam_search_decoder def _transpose_batch_time(x): @@ -35,7 +35,7 @@ def test_gather_tree_one(): parent_ids = _transpose_batch_time([[[0, 0, 0], [0, 1, 1], [2, 1, 2], [-1, -1, -1]]]) max_sequence_lengths = [3] expected_result = _transpose_batch_time([[[2, 2, 2], [6, 5, 6], [7, 8, 9], [10, 10, 10]]]) - beams = gather_tree( + beams = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -54,7 +54,7 @@ def test_bad_parent_values_on_cpu(): max_sequence_lengths = [3] with pytest.raises(tf.errors.InvalidArgumentError, match="parent id"): - _ = gather_tree( + _ = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -73,7 +73,7 @@ def test_bad_parent_values_on_gpu(): max_sequence_lengths = [3] with pytest.raises(tf.errors.InvalidArgumentError, match="parent id"): - _ = gather_tree( + _ = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -92,7 +92,7 @@ def test_gather_tree_batch(): step_ids = np.random.randint(0, high=end_token + 1, size=(max_time, batch_size, beam_width)) parent_ids = np.random.randint(0, high=beam_width - 1, size=(max_time, batch_size, beam_width)) - beams = gather_tree( + beams = beam_search_decoder.gather_tree( step_ids=step_ids.astype(np.int32), parent_ids=parent_ids.astype(np.int32), max_sequence_lengths=max_sequence_lengths, diff --git a/deepray/seq2seq/tests/decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/decoder_test.py similarity index 97% rename from deepray/seq2seq/tests/decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/decoder_test.py index b5eb9d8c..7b07c73a 100644 --- a/deepray/seq2seq/tests/decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/decoder_test.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.decoder.""" +"""Tests for tfa.seq2seq.decoder.""" import numpy as np import pytest import tensorflow as tf -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py from deepray.utils import test_utils @@ -28,6 +28,7 @@ @pytest.mark.parametrize("maximum_iterations", [None, 1, tf.constant(1, dtype=tf.int32)]) @pytest.mark.parametrize("time_major", [True, False]) def test_dynamic_decode_rnn(time_major, maximum_iterations): + sequence_length = [3, 4, 3, 1, 0] batch_size = 5 max_time = 8 diff --git a/deepray/seq2seq/tests/loss_test.py b/deepray/custom_ops/seq2seq/python/tests/loss_test.py similarity index 99% rename from deepray/seq2seq/tests/loss_test.py rename to deepray/custom_ops/seq2seq/python/tests/loss_test.py index 6d48f578..e187292d 100644 --- a/deepray/seq2seq/tests/loss_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/loss_test.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for tf.deepray.seq2seq.python.loss_ops.""" +"""Tests for tf.addons.seq2seq.python.loss_ops.""" import pytest import numpy as np import tensorflow as tf -from deepray.seq2seq import loss +from deepray.custom_ops.seq2seq import loss def get_test_data(): @@ -98,6 +98,7 @@ def test_sequence_loss(average_across_timesteps, average_across_batch, zero_weig @pytest.mark.parametrize("average_across_timesteps", [True, False]) @pytest.mark.parametrize("average_across_batch", [True, False]) def test_sequence_loss_class(average_across_timesteps, average_across_batch): + ( batch_size, sequence_length, diff --git a/deepray/custom_ops/seq2seq/python/tests/run_all_test.py b/deepray/custom_ops/seq2seq/python/tests/run_all_test.py new file mode 100644 index 00000000..62a5c7ed --- /dev/null +++ b/deepray/custom_ops/seq2seq/python/tests/run_all_test.py @@ -0,0 +1,9 @@ +from pathlib import Path +import sys + +import pytest + +if __name__ == "__main__": + dirname = Path(__file__).absolute().parent + # sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-n 20", "-s", "-v", str(dirname)])) diff --git a/deepray/custom_ops/simple_hash_table/BUILD b/deepray/custom_ops/simple_hash_table/BUILD index 4d911a5b..660ba05e 100644 --- a/deepray/custom_ops/simple_hash_table/BUILD +++ b/deepray/custom_ops/simple_hash_table/BUILD @@ -12,7 +12,7 @@ custom_op_library( "simple_hash_table_op.cc", ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", + "//deepray/custom_ops/utils:ok_status_util", ], ) @@ -21,8 +21,6 @@ py_library( srcs = ["simple_hash_table_op.py"], data = ["simple_hash_table_kernel.so"], srcs_version = "PY3", - deps = [ - ], ) py_library( @@ -56,5 +54,7 @@ py_test( ], deps = [ ":simple_hash_table", + "@pypi_absl_py//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc index 94ab7103..a603fee0 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc @@ -17,18 +17,11 @@ limitations under the 
License. #include "absl/container/flat_hash_map.h" #include "absl/strings/str_cat.h" +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/strcat.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif - // Please use the appropriate namespace for your project namespace tensorflow { namespace custom_op_examples { @@ -100,6 +93,8 @@ class SimpleHashTableResource : public ::tensorflow::ResourceBase { Status Import(const Tensor& keys, const Tensor& values) { const auto key_values = keys.flat(); const auto value_values = values.flat(); + LOG(INFO) << "key_values = " << key_values; + LOG(INFO) << "value_values = " << value_values; mutex_lock l(mu_); table_.clear(); diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc index cbc9022f..e96c2f46 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc @@ -15,17 +15,10 @@ limitations under the License. #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif - // Please use the appropriate namespace for your project namespace tensorflow { namespace custom_op_examples { diff --git a/deepray/custom_ops/sleep/BUILD b/deepray/custom_ops/sleep/BUILD index ab6cf8bc..3a49e002 100644 --- a/deepray/custom_ops/sleep/BUILD +++ b/deepray/custom_ops/sleep/BUILD @@ -11,7 +11,7 @@ custom_op_library( "sleep_op.cc", ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", + "//deepray/custom_ops/utils:ok_status_util", ], ) @@ -45,5 +45,6 @@ py_test( ], deps = [ ":sleep_op", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/sleep/sleep_op.cc b/deepray/custom_ops/sleep/sleep_op.cc index 0fe77599..df6b37c7 100644 --- a/deepray/custom_ops/sleep/sleep_op.cc +++ b/deepray/custom_ops/sleep/sleep_op.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" +using namespace tensorflow; // NOLINT(build/namespaces) + // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. 
// This is the recommendation for out-of-tree ops to avoid name collisions in @@ -28,7 +31,7 @@ using ::tensorflow::shape_inference::InferenceContext; ::tensorflow::Status ScalarOutput(InferenceContext* c) { c->set_output(0, c->Scalar()); - return ::tensorflow::Status::OK(); + return TFOkStatus; } REGISTER_OP("Examples>AsyncSleep") diff --git a/deepray/custom_ops/text/BUILD b/deepray/custom_ops/text/BUILD index b9641b60..55b151e3 100644 --- a/deepray/custom_ops/text/BUILD +++ b/deepray/custom_ops/text/BUILD @@ -1,10 +1,10 @@ -load("//deepray:deepray.bzl", "custom_op_library") +load("//build_deps/pip_tf:defs.bzl", "tf_custom_op_library") licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) -custom_op_library( +tf_custom_op_library( name = "_skip_gram_ops.so", srcs = [ "cc/kernels/skip_gram_kernels.cc", @@ -12,7 +12,7 @@ custom_op_library( ], ) -custom_op_library( +tf_custom_op_library( name = "_parse_time_op.so", srcs = select({ "//deepray:windows": [], diff --git a/deepray/custom_ops/training_ops/BUILD b/deepray/custom_ops/training_ops/BUILD index 8ff1851a..c830984b 100644 --- a/deepray/custom_ops/training_ops/BUILD +++ b/deepray/custom_ops/training_ops/BUILD @@ -1,3 +1,4 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//deepray:deepray.bzl", "custom_op_library") licenses(["notice"]) # Apache 2.0 @@ -9,10 +10,15 @@ custom_op_library( "cc/kernels/training_ops.h", "cc/ops/training_ops.cc", ], - cuda_srcs = [ + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_srcs = [ "cc/kernels/training_ops.h", "cc/kernels/training_ops_gpu.cu.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + # "@org_tensorflow//tensorflow/core/kernels:training_op_helpers", + ], ) py_library( diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc b/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc index 0cf0988e..0e455820 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include // NOLINT +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -31,7 +32,6 @@ using GPUDevice = Eigen::GpuDevice; using Index = Eigen::Index; namespace functor { - template struct SparseApplyAdam { Status operator()(const CPUDevice& d, typename TTypes::Matrix var, @@ -46,7 +46,7 @@ struct SparseApplyAdam { typename TTypes::ConstVec indices, const int64 inner_dim) { const Tindex N = static_cast(indices.dimension(0)); - if (N == 0) return Status::OK(); + if (N == 0) return TFOkStatus; const Tindex first_dim_size = static_cast(var.dimension(0)); const T beta1_power_scalar = beta1_power(); const T beta2_power_scalar = beta2_power(); @@ -120,11 +120,10 @@ struct SparseApplyAdam { d.parallelFor(N, cost, DoWork); } - return Status::OK(); + return TFOkStatus; } }; - -} // namespace functor +} // End of namespace functor template class SparseApplyAdamOp : public OpKernel { @@ -133,7 +132,7 @@ class SparseApplyAdamOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } - void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { const bool sparse = true; auto locks = MaybeLockVariableInputMutexesInOrder( ctx, use_exclusive_lock_, sparse, {0, 1, 2}); @@ -290,4 +289,187 @@ REGISTER_KERNELS(GPU, double, int64); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #undef REGISTER_KERNELS +namespace functor { +template +struct ApplyAdamAsync { + void operator()(const CPUDevice& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov) { + auto alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) / + (T(1) - beta1_power()); + + // beta1 == μ + // beta2 == ν + // v == n + // var == θ + m.device(d) = m * beta1() + grad * (T(1) - beta1()); + v.device(d) = v * beta2() + grad.square() * (T(1) - beta2()); + if (use_nesterov) { + var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) / + (v.sqrt() + epsilon()); + } else { + var.device(d) -= (m * alpha) / (v.sqrt() + epsilon()); + } + + // update beta1_power && beta2_power + beta1_power.device(d) = beta1_power * beta1(); + beta2_power.device(d) = beta2_power * beta2(); + } +}; +} // namespace functor + +template +class ApplyAdamAsyncOp : public OpKernel { + public: + explicit ApplyAdamAsyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); + } + + void Compute(OpKernelContext* ctx) override { + const bool sparse = false; + auto locks = MaybeLockVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3, 4}); + + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 0, use_exclusive_lock_, false, &var)); + Tensor m; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 1, use_exclusive_lock_, false, &m)); + Tensor v; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 2, use_exclusive_lock_, false, &v)); + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, 
use_exclusive_lock_, false, &beta1_power)); + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, false, &beta2_power)); + + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(0))); + OP_REQUIRES( + ctx, m.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(1))); + OP_REQUIRES( + ctx, v.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(2))); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar : ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + + const Tensor& grad = ctx->input(9); + OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), + errors::InvalidArgument("var and m do not have the same shape", + var.shape().DebugString(), " ", + m.shape().DebugString())); + OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), + errors::InvalidArgument("var and v do not have the same shape", + var.shape().DebugString(), " ", + v.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + + const Device& device = ctx->template eigen_device(); + functor::ApplyAdamAsync()( + device, var.flat(), m.flat(), v.flat(), + beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + grad.flat(), use_nesterov_); + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool use_nesterov_; +}; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdamAsync").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyAdamAsyncOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAdamAsync::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::Flat v, \ + typename TTypes::Scalar beta1_power, \ + typename TTypes::Scalar beta2_power, \ + typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar beta1, \ + typename TTypes::ConstScalar beta2, \ + typename TTypes::ConstScalar epsilon, \ + typename TTypes::ConstFlat grad, bool use_nesterov); \ + extern template struct ApplyAdamAsync; + +DECLARE_GPU_SPEC(Eigen::half) +DECLARE_GPU_SPEC(float) +DECLARE_GPU_SPEC(double) +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T); + +TF_CALL_half(REGISTER_GPU_KERNELS); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#endif // end of GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#undef REGISTER_KERNELS + } // namespace tensorflow diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops.h b/deepray/custom_ops/training_ops/cc/kernels/training_ops.h index f657bf23..8a9d2a48 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops.h +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops.h @@ -43,6 +43,19 @@ struct SparseApplyAdam { const int64 inner_dim); }; +template +struct ApplyAdamAsync { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov); +}; + } // end namespace functor } // end namespace tensorflow diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc b/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc index 7c627898..60a29a5b 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/util/gpu_kernel_helper.h" #include "training_ops.h" @@ -71,7 +72,7 @@ struct SparseApplyAdam { typename TTypes::ConstVec indices, const int64 inner_dim) { const Tindex N = static_cast(indices.dimension(0)); - if (N == 0) return Status::OK(); + if (N == 0) return TFOkStatus; const Tindex first_dim_size = var.dimension(0); const Tindex grad_size = grad.size(); @@ -87,6 +88,62 @@ struct SparseApplyAdam { } }; +template +__global__ __launch_bounds__(1024) void ApplyAdamAsyncKernel( + T* var, T* m, T* v, T* beta1_power, T* beta2_power, const T* lr_scalar, + const T* beta1_scalar, const T* beta2_scalar, const T* epsilon_scalar, + const T* grad, const bool use_nesterov, const int32 grad_size) { + T lr = *lr_scalar; + T beta1 = *beta1_scalar; + T beta2 = *beta2_scalar; + T epsilon = *epsilon_scalar; + T alpha = lr * sqrt(static_cast(1) - *beta2_power) / + (static_cast(1) - *beta1_power); + + // beta1 == μ + // beta2 == ν + // v == n + // var == θ + GPU_1D_KERNEL_LOOP(index, grad_size) { + m[index] = m[index] * beta1 + grad[index] * (static_cast(1) - beta1); + v[index] = v[index] * beta2 + + grad[index] * grad[index] * (static_cast(1) - beta2); + if (use_nesterov) { + var[index] -= + ((grad[index] * (static_cast(1) - beta1) + beta1 * m[index]) * + alpha) / + (sqrt(v[index]) + epsilon); + } else { + var[index] -= (m[index] * alpha) / (sqrt(v[index]) + epsilon); + } + } +} + +template +struct ApplyAdamAsync { + void operator()(const GPUDevice& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov) { + int32 grad_size = grad.size(); + + GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d); + GpuLaunchKernel(ApplyAdamAsyncKernel, config.block_count, + config.thread_per_block, 0, d.stream(), var.data(), + m.data(), v.data(), beta1_power.data(), beta2_power.data(), + lr.data(), beta1.data(), beta2.data(), epsilon.data(), + grad.data(), use_nesterov, grad_size); + // update beta1_power && beta2_power + beta1_power.device(d) = beta1_power * beta1; + beta2_power.device(d) = beta2_power * beta2; + } +}; + } // namespace functor #define EXPLICITLY_INSTANTIATE_FUNCTOR(T) \ @@ -97,6 +154,11 @@ EXPLICITLY_INSTANTIATE_FUNCTOR(float); EXPLICITLY_INSTANTIATE_FUNCTOR(double); #undef EXPLICITLY_INSTANTIATE_FUNCTOR +#define REGISTER_ALL_TYPE(type) \ + template struct functor::ApplyAdamAsync; +TF_CALL_GPU_NUMBER_TYPES(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + } // end namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/deepray/custom_ops/training_ops/cc/ops/training_ops.cc b/deepray/custom_ops/training_ops/cc/ops/training_ops.cc index fda482d0..5ddf902e 100644 --- a/deepray/custom_ops/training_ops/cc/ops/training_ops.cc +++ b/deepray/custom_ops/training_ops/cc/ops/training_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -39,7 +40,7 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); if (!sparse) { TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); - return Status::OK(); + return TFOkStatus; } // Indices is a vector where indices.dim[0].rank == grad[0].rank. ShapeHandle indices; @@ -53,7 +54,7 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); - return Status::OK(); + return TFOkStatus; } static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { @@ -72,7 +73,7 @@ static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { if (c->num_outputs() > 0) { c->set_output(0, s); } - return Status::OK(); + return TFOkStatus; } REGISTER_OP("SparseApplyAdam") @@ -114,12 +115,50 @@ REGISTER_OP("ResourceSparseApplyAdam") return ApplyAdamShapeFn(c, true /* sparse */); }); -REGISTER_OP("ResourceApplyAdam") +static Status ApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return TFOkStatus; +} + +REGISTER_OP("ApplyAdamAsync") + .Input("var: Ref(T)") + .Input("m: Ref(T)") + .Input("v: Ref(T)") + .Input("beta1_power: Ref(T)") + .Input("beta2_power: Ref(T)") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamAsyncShapeFn(c, false /* sparse */); + }); + +REGISTER_OP("ResourceApplyAdamAsync") .Input("var: resource") .Input("m: resource") .Input("v: resource") - .Input("beta1_power: T") - .Input("beta2_power: T") + .Input("beta1_power: resource") + .Input("beta2_power: resource") .Input("lr: T") .Input("beta1: T") .Input("beta2: T") @@ -129,7 +168,7 @@ REGISTER_OP("ResourceApplyAdam") .Attr("use_locking: bool = false") .Attr("use_nesterov: bool = false") .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); + return ApplyAdamAsyncShapeFn(c, false /* sparse */); }); } // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/unique_ops/BUILD b/deepray/custom_ops/unique_ops/BUILD index e240e9a2..9fa689f7 100644 --- a/deepray/custom_ops/unique_ops/BUILD +++ b/deepray/custom_ops/unique_ops/BUILD @@ -1,5 +1,5 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//deepray:deepray.bzl", 
"custom_op_library") -load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION") licenses(["notice"]) # Apache 2.0 @@ -11,44 +11,42 @@ package( ) cc_library( - name = "random", + name = "unique_ali_util", srcs = [ - "cc/kernels/random.cc", - "cc/kernels/random.h", - ], - copts = [CPLUSPLUS_VERSION], - deps = [ - "@local_config_tf//:libtensorflow_framework", - "@local_config_tf//:tf_header_lib", + "cc/kernels/task_runner.h", + "cc/kernels/unique_ali_op_util.h", ], -) - -cc_test( - name = "random_test", - srcs = ["cc/kernels/random_test.cc"], deps = [ - ":random", - "@com_google_googletest//:gtest_main", + "//deepray/custom_ops/utils:ok_status_util", + "//deepray/custom_ops/utils:random", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@sparsehash_c11//:dense_hash_map", ], ) custom_op_library( name = "_unique_ops.so", srcs = [ - "cc/kernels/task_runner.h", "cc/kernels/unique_ali_op.cc", - "cc/kernels/unique_ali_op_util.h", "cc/ops/unique_ops.cc", ], - copts = [CPLUSPLUS_VERSION], - cuda_srcs = [ + copts = [ + "-Wno-unused-variable", + "-Wno-unused-result", + ] + if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_srcs = [ "cc/kernels/unique_ali_op_gpu.cu.cc", ], visibility = ["//visibility:public"], deps = [ - ":random", + ":unique_ali_util", "@com_google_absl//absl/container:flat_hash_map", - "@sparsehash_c11//:dense_hash_map", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", ], ) @@ -73,5 +71,8 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":unique_ops", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random.cc b/deepray/custom_ops/unique_ops/cc/kernels/random.cc deleted file mode 100644 index 1bf84917..00000000 --- a/deepray/custom_ops/unique_ops/cc/kernels/random.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "random.h" - -#include - -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/env_var.h" - -namespace tensorflow { -namespace random { - -namespace { -std::mt19937_64* InitRngWithRandomSeed() { - std::random_device device("/dev/urandom"); - return new std::mt19937_64(device()); -} -std::mt19937_64 InitRngWithDefaultSeed() { return std::mt19937_64(); } - -} // anonymous namespace - -uint64 New64() { - static std::mt19937_64* rng = InitRngWithRandomSeed(); - static mutex mu(LINKER_INITIALIZED); - mutex_lock l(mu); - return (*rng)(); -} - -uint64 New64DefaultSeed() { - static std::mt19937_64 rng = InitRngWithDefaultSeed(); - static mutex mu(LINKER_INITIALIZED); - mutex_lock l(mu); - return rng(); -} - -uint64 New64Configuable() { - int64 random_64; - CHECK( - ReadInt64FromEnvVar("DEEPREC_CONFIG_RAND_64", New64(), &random_64).ok()); - return static_cast(random_64); -} - -} // namespace random -} // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h b/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h index 922f0596..566e8c4b 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h +++ b/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/blocking_counter.h" namespace tensorflow { diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc index 047ff3bd..c6056334 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc @@ -17,36 +17,28 @@ limitations under the License. 
#include #include -#include "absl/container/flat_hash_map.h" -#include "sparsehash/dense_hash_map" #include "task_runner.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/util/env_var.h" #include "unique_ali_op_util.h" namespace tensorflow { namespace { -const char *kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; -const char *kUniqueOpHashMapEnv = "DEEPREC_UNIQUE_OP_HASH_MAP"; -const char *kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; -const char *kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; -const char *kMultiMapString = "MULTIMAP"; -const char *kStlHashMapString = "STL"; -const char *kAbslHashMapString = "ABSL"; -const char *kGoogleHashMapString = "GOOGLE"; +const char* kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; +const char* kUniqueOpHashMapEnv = "DEEPREC_UNIQUE_OP_HASH_MAP"; +const char* kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; +const char* kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; +const char* kMultiMapString = "MULTIMAP"; +const char* kStlHashMapString = "STL"; +const char* kAbslHashMapString = "ABSL"; +const char* kGoogleHashMapString = "GOOGLE"; const int64 kDefaultUniqueRatioHint = 4; } // namespace template class UniqueAliOp : public OpKernel { public: - explicit UniqueAliOp(OpKernelConstruction *context) : OpKernel(context) { + explicit UniqueAliOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK( context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, kPartitionSize, &partition_size_)); @@ -101,14 +93,14 @@ class UniqueAliOp : public OpKernel { } } - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { VLOG(2) << "Unique V2 executed"; ComputeInternal(context); } private: - void ComputeInternal(OpKernelContext *context) { - const Tensor &input = context->input(0); + void ComputeInternal(OpKernelContext* context) { + const Tensor& input = context->input(0); Tensor idx; Tensor output; Tensor output_counter; @@ -117,7 +109,7 @@ class UniqueAliOp : public OpKernel { context, input, &idx, &output, &output_counter, num_outputs(), partition_size_, serial_, unique_ratio_hint_, map_flag_); } else { - const Tensor &axis_tensor = context->input(1); + const Tensor& axis_tensor = context->input(1); UniqueWithAxis(context, input, axis_tensor, &idx, &output, &output_counter, num_outputs(), partition_size_, serial_, unique_ratio_hint_, map_flag_); @@ -129,33 +121,65 @@ class UniqueAliOp : public OpKernel { } } + protected: bool serial_ = false; int64 partition_size_ = 0; int64 unique_ratio_hint_; UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default }; +template +class UniqueWithCountAliOp : public UniqueAliOp { + using UniqueAliOp::serial_; + using UniqueAliOp::partition_size_; + using UniqueAliOp::unique_ratio_hint_; + using UniqueAliOp::map_flag_; + using OpKernel::num_outputs; + + public: + explicit UniqueWithCountAliOp(OpKernelConstruction* context) + : UniqueAliOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("N", &num_sparse_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + Tensor idx; + Tensor output; + Tensor output_counter; + UniqueWithExtraCounts( + context, input, &idx, &output, 
&output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, num_sparse_, map_flag_); + context->set_output(0, output); + context->set_output(1, idx); + context->set_output(2, output_counter); + } + + private: + int num_sparse_; +}; + #define REGISTER_UNIQUE(type) \ REGISTER_KERNEL_BUILDER(Name("Deepray>Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -165,7 +189,7 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -175,7 +199,17 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp) + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(tstring) #undef REGISTER_UNIQUE @@ -199,7 +233,17 @@ REGISTER_UNIQUE(tstring) .HostMemory("count") \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(tstring) #undef REGISTER_UNIQUE diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc index c3677d26..05075327 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc @@ -19,23 +19,15 @@ limitations under the License. 
#include "cub/device/device_radix_sort.cuh" #include "cub/device/device_scan.cuh" -#include "cub/device/device_select.cuh" -#include "cub/iterator/constant_input_iterator.cuh" #include "cub/iterator/counting_input_iterator.cuh" #include "cub/iterator/transform_input_iterator.cuh" +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/cuda.h" -#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/gpu_kernel_helper.h" #include "tensorflow/core/util/gpu_solvers.h" // For ScratchSpace -#include "tensorflow/stream_executor/stream_executor.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { using GPUDevice = Eigen::GpuDevice; @@ -149,7 +141,7 @@ class UniqueAliV2GpuOp : public AsyncOpKernel { &device, this](int64 N_out) { TF_RETURN_IF_ERROR(ctx->allocate_output(0, {N_out}, &output_tensor)); TF_RETURN_IF_ERROR(ctx->allocate_output(1, {N}, &idx_tensor)); - return Status::OK(); + return TFOkStatus; }; if (N == 0) { OP_REQUIRES_OK_ASYNC(ctx, allocate_output(0), done); @@ -242,7 +234,7 @@ class UniqueAliV2GpuOp : public AsyncOpKernel { ->ThenMemcpy(N_out.mutable_data(), wrapped_num_out, sizeof(TIndex)) .ok(), errors::Internal("Failed to launch copy from device to host."), done); - ctx->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + ctx->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute( stream, [ref_output_indices]() { ref_output_indices.Unref(); }); stream->BlockHostUntilDone(); int64_t uniq_size = (*N_out.data()) + 1; diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h index c27afd2e..54287e9d 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h @@ -22,21 +22,14 @@ limitations under the License. 
#include #include -#include "absl/container/flat_hash_map.h" -#include "random.h" +#include "deepray/custom_ops/utils/ok_status_util.h" +#include "deepray/custom_ops/utils/random.h" #include "sparsehash/dense_hash_map" #include "task_runner.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/blocking_counter.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/util/env_var.h" -#include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -192,7 +185,9 @@ void NewSizes(OpKernelContext* context, const Tensor& input, template void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, - int64 axis, int64* uniq_size, Tensor* output) { + int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, + Tensor* output) { auto Tin = input.flat(); const int64 N = input.NumElements(); auto idx_vec = idx->template vec(); @@ -207,6 +202,22 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, } } + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto idx_it = uniq.find(ids); + if (idx_it != uniq.end()) { + counter_map->emplace(idx_it->second, counter_vec(k)); + } + } + } + *uniq_size = static_cast(uniq.size()); TensorShape output_shape(input.shape()); output_shape.set_dim(axis, *uniq_size); @@ -224,6 +235,8 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, template void ParallelComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size, + int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { // Struct INode was used to store an inverse mapping for each node in the // hash map container. @@ -424,6 +437,25 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, TaskRunner t3_runner(GlobalIndexTask, thread_pool, num_tasks_t1); t3_runner.Run(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + for (int j = 0; j < num_tasks_t1; ++j) { + const INode* inode = uniq_maps[j].GetINodeByKey(ids); + if (inode != nullptr) { + counter_map->emplace(inode->index_, counter_vec(k)); + continue; + } + } + } + } + // Parallel Step 4: Write output indicies Tensor. 
int32 max_tasks_t4 = (N + kPartitionSize - 1) / kPartitionSize; int32 num_tasks_t4 = std::max(std::min(max_threads, max_tasks_t4), 1); @@ -458,7 +490,9 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, template void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size_out, int32 num_buckets, - int64 unique_ratio_hint, Tensor* output) { + int64 unique_ratio_hint, int num_sparse, + google::dense_hash_map* counter_map, + Tensor* output) { auto Tin = input.vec(); const int64 N = input.NumElements(); @@ -475,7 +509,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, Partitioner map_parter(N, num_partitions); auto PartitionTask = [N, num_buckets, &Tin, &partitions, &map_parter, &idx_vec](int32 task_id, int32 num_tasks) { - auto st = Status::OK(); + auto st = TFOkStatus; int64* partition = partitions.get() + task_id * num_buckets; for (int64 i = 0; i < num_buckets; ++i) { partition[i] = -1; @@ -499,7 +533,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, }; SummaryTaskRunner t0_runner( - PartitionTask, Status::OK(), thread_pool, num_partitions); + PartitionTask, TFOkStatus, thread_pool, num_partitions); t0_runner.Run(); OP_REQUIRES_OK(context, t0_runner.summary()); @@ -543,6 +577,24 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 uniq_size = global_offsets[num_buckets - 1] + uniq_maps[num_buckets - 1].size(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * uniq_size); + + google::dense_hash_map extra_unique_id_map; + extra_unique_id_map.set_empty_key(std::numeric_limits::max()); + extra_unique_id_map.resize(2 * uniq_size); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto counts = counter_vec(k); + extra_unique_id_map.emplace(ids, counts); + } + } + *uniq_size_out = uniq_size; AllocatorAttributes attr; attr.set_on_host(true); @@ -552,7 +604,8 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, auto key_output_vec = output->template vec(); auto OutputTask = [&key_output_vec, &uniq_maps, &global_offsets, &Tin, - &idx_vec, &map_parter](int32 task_id, int32 num_tasks) { + &idx_vec, &map_parter, &counter_map, + extra_unique_id_map](int32 task_id, int32 num_tasks) { TIndex offset = global_offsets[task_id]; for (auto iter = uniq_maps[task_id].begin(); iter != uniq_maps[task_id].end(); ++iter) { @@ -566,7 +619,10 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, next_idx = idx_vec(cur_idx); idx_vec(cur_idx) = offset; } - + auto it = extra_unique_id_map.find(iter->first); + if (it != extra_unique_id_map.end()) { + counter_map->emplace(offset, it->second); + } ++offset; } }; @@ -631,8 +687,10 @@ void MultipleElements(OpKernelContext* context, const Tensor& input, } template -void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, - Tensor* idx, int num_outputs, int64 uniq_size) { +void CheckCountOutput(OpKernelContext* context, Tensor* output, + Tensor* output_counter, Tensor* idx, int num_outputs, + int64 uniq_size, int num_sparse, + google::dense_hash_map counter_map) { if (num_outputs > 2) { auto 
idx_vec = idx->template vec(); AllocatorAttributes attr; @@ -646,13 +704,19 @@ void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, for (int64 i = 0; i < N; ++i) { count_output_vec(idx_vec(i))++; } + if (num_sparse > 0) { + for (auto& it : counter_map) { + count_output_vec(it.first) += (it.second - 1); + } + } } } template -void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, - int64 N, bool serial, Tensor* output) { +void ComputeInternalWithHashMap( + OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, + int64* uniq_size, int64 N, int num_sparse, bool serial, + google::dense_hash_map* counter_map, Tensor* output) { OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("unique expects a 1D vector.")); // TODO(dga): Make unique polymorphic for returning int32 and int64 @@ -664,10 +728,10 @@ void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, if (N >= kPartitionLimit && !serial) { ParallelComputeV1(context, input, idx, axis, uniq_size, - output); + num_sparse, counter_map, output); } else { SerialComputeV1(context, input, idx, axis, uniq_size, - output); + num_sparse, counter_map, output); } } @@ -676,7 +740,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, int64 partition_size, bool serial, int64 axis, int64 unique_ratio_hint, std::vector& new_sizes, - UniqueMaps map_flag) { + UniqueMaps map_flag, int num_sparse = 0) { typedef google::dense_hash_map DefaultHashMap; AllocatorAttributes attr; @@ -686,6 +750,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, TensorShape({new_sizes[1]}), idx, attr)); int64 uniq_size_out; + google::dense_hash_map counter_map; if (new_sizes[0] == 1 && new_sizes[2] == 1) { // Specialized and faster implementation when unique is run over single @@ -704,35 +769,40 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, MultiMapCompute>( context, input, idx, axis, &uniq_size_out, num_buckets, - unique_ratio_hint, output); + unique_ratio_hint, num_sparse, &counter_map, output); } else { SerialComputeV1(context, input, idx, axis, - &uniq_size_out, output); + &uniq_size_out, num_sparse, + &counter_map, output); } break; case STL: ComputeInternalWithHashMap>( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; case ABSL: ComputeInternalWithHashMap>( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; case GOOGLE: ComputeInternalWithHashMap( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; default: ComputeInternalWithHashMap( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); } } else { MultipleElements(context, input, idx, output, &uniq_size_out, axis, new_sizes); } - CheckCountOutput(context, output_counter, idx, num_outputs, - uniq_size_out); + CheckCountOutput(context, output, output_counter, idx, num_outputs, + uniq_size_out, num_sparse, counter_map); } template @@ -763,6 +833,21 @@ void 
UniqueWithAxis(OpKernelContext* context, const Tensor& input, unique_ratio_hint, new_sizes, map_flag); } +template +void UniqueWithExtraCounts(OpKernelContext* context, const Tensor& input, + Tensor* idx, Tensor* output, Tensor* output_counter, + int num_outputs, int64 partition_size, bool serial, + int64 unique_ratio_hint, int num_sparse, + UniqueMaps map_flag) { + int64 axis = 0; + std::vector new_sizes{1, input.NumElements(), 1}; + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + UniqueInternal(context, input, idx, output, output_counter, + num_outputs, partition_size, serial, axis, + unique_ratio_hint, new_sizes, map_flag, num_sparse); +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_ALI_OP_UTIL_H_ diff --git a/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc b/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc index f8158336..67c83d6e 100644 --- a/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc +++ b/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -46,10 +47,9 @@ REGISTER_OP("Deepray>UniqueV2") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); c->set_output(1, c->input(0)); - return Status::OK(); + return TFOkStatus; }); -// -------------------------------------------------------------------------- REGISTER_OP("Deepray>UniqueWithCounts") .Input("x: T") .Output("y: T") @@ -62,7 +62,7 @@ REGISTER_OP("Deepray>UniqueWithCounts") c->set_output(0, uniq); c->set_output(1, c->input(0)); c->set_output(2, uniq); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("Deepray>UniqueWithCountsV2") @@ -79,7 +79,25 @@ REGISTER_OP("Deepray>UniqueWithCountsV2") c->set_output(0, uniq); c->set_output(1, c->input(0)); c->set_output(2, uniq); - return Status::OK(); + return TFOkStatus; + }); + +REGISTER_OP("Deepray>UniqueWithExtraCounts") + .Input("x: T") + .Input("extra_indices: N * T") + .Input("extra_counts: N * out_idx") + .Output("y: T") + .Output("idx: out_idx") + .Output("count: out_idx") + .Attr("T: type") + .Attr("N: int >= 0") + .Attr("out_idx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + auto uniq = c->Vector(InferenceContext::kUnknownDim); + c->set_output(0, uniq); + c->set_output(1, c->input(0)); + c->set_output(2, uniq); + return TFOkStatus; }); } // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py b/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py new file mode 100644 index 00000000..bdf8a334 --- /dev/null +++ b/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py @@ -0,0 +1,349 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.kernels.unique_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import numpy as np + +# set environ before tf initializing global varialbes +PreservedKey = 1 << 10 +os.environ["DEEPREC_CONFIG_RAND_64"] = str(PreservedKey) + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import constant_op +from tensorflow.python.platform import test + +from deepray.custom_ops.unique_ops import gen_array_ops + + +class UniqueTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(0, high=1000, size=700000) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt64OutIdxInt64(self): + np.random.seed(0) + x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt64OutIdxInt32(self): + np.random.seed(0) + x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int32) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testString(self): + indx = np.random.randint(65, high=122, size=70000) + x = [chr(i) for i in indx] + with self.cached_session() as sess: + y, idx = gen_array_ops.deepray_unique(x) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) + + def testInt32Axis(self): + for dtype in [np.int32, np.int64]: + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + with self.cached_session() as sess: + y0, idx0 = gen_array_ops.deepray_unique_v2(x, axis=np.array([0], dtype)) + tf_y0, tf_idx0 = sess.run([y0, idx0]) + y1, idx1 = gen_array_ops.deepray_unique_v2(x, axis=np.array([1], dtype)) + tf_y1, tf_idx1 = 
sess.run([y1, idx1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + + def testInt32V2(self): + # This test is only temporary, once V2 is used + # by default, the axis will be wrapped to allow `axis=None`. + x = np.random.randint(2, high=10, size=7000) + with self.cached_session() as sess: + y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def IllegalIdForMultMapUnique(self): + recover_env = False + if 'DEEPREC_UNIQUE_OP_PARTITION_SIZE' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] + os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = '2' + + with self.cached_session() as sess: + x = np.array([-1, 0, 1, PreservedKey], dtype=np.int64) + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + with self.assertRaisesRegexp( + errors_impl.InvalidArgumentError, "Input id is preserved key of dense_hash_map, " + "not supported: " + str(PreservedKey) + ): + tf_y, tf_idx = sess.run([y, idx]) + + del os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = old_env + + def RunUniqueWithDifferentMaps(self, map_type, test_illegal_key=False): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + self.testInt64OutIdxInt64() + self.testInt64OutIdxInt32() + self.testInt32Axis() + self.testInt32V2() + if test_illegal_key: + self.IllegalIdForMultMapUnique() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueMultiMap(self): + self.RunUniqueWithDifferentMaps('MULTIMAP', True) + + def testUniqueStlMap(self): + self.RunUniqueWithDifferentMaps('STL') + + def testUniqueAbslMap(self): + self.RunUniqueWithDifferentMaps('ABSL') + + def testUniqueDenseHashMap(self): + self.RunUniqueWithDifferentMaps('GOOGLE') + + +class UniqueWithCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x, out_idx=dtypes.int64) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def testString(self): + indx = np.random.randint(65, high=122, size=7000) + x = [chr(i) for i in 
indx] + + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) + for value, count in zip(tf_y, tf_count): + v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] + self.assertEqual(count, sum(v)) + + def testInt32Axis(self): + for dtype in [np.int32, np.int64]: + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + with self.cached_session() as sess: + y0, idx0, count0 = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([0], dtype)) + tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0]) + y1, idx1, count1 = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([1], dtype)) + tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_count0, np.array([2, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + self.assertAllEqual(tf_count1, np.array([1, 2])) + + def testInt32V2(self): + # This test is only temporary, once V2 is used + # by default, the axis will be wrapped to allow `axis=None`. + x = np.random.randint(2, high=10, size=7000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([], np.int32)) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + self.testInt32Axis() + self.testInt32V2() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') + + +class UniqueWithExtraCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int32)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x 
== value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int64)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') + + +if __name__ == '__main__': + test.main() diff --git a/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py b/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py deleted file mode 100644 index a3b8a470..00000000 --- a/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for tensorflow.kernels.unique_op.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -from tensorflow.python.framework import dtypes -from tensorflow.python.platform import test -from tensorflow.python.framework import errors_impl - -from deepray.custom_ops.unique_ops import gen_array_ops - -unique = gen_array_ops.deepray_unique - -# set environ before tf initializing global varialbes -PreservedKey = 1 << 10 -os.environ["DEEPREC_CONFIG_RAND_64"] = str(PreservedKey) - - -class UniqueTest(test.TestCase): - - def testInt32(self): - x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt32OutIdxInt64(self): - x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt64OutIdxInt64(self): - np.random.seed(0) - x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) - with self.cached_session(use_gpu=True) as sess: - y, idx = unique(x, out_idx=dtypes.int64) - tf_y, tf_idx = sess.run([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt64OutIdxInt32(self): - np.random.seed(0) - x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) - with self.cached_session(use_gpu=True) as sess: - y, idx = unique(x, out_idx=dtypes.int32) - tf_y, tf_idx = sess.run([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testString(self): - indx = np.random.randint(65, high=122, size=7000) - x = [chr(i) for i in indx] - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) - - def testInt32Axis(self): - for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - with self.cached_session() as sess: - y0, idx0 = gen_array_ops.deepray_unique_v2(x, axis=np.array([0], dtype)) - tf_y0, tf_idx0 = self.evaluate([y0, idx0]) - y1, idx1 = gen_array_ops.deepray_unique_v2(x, axis=np.array([1], dtype)) - tf_y1, tf_idx1 = self.evaluate([y1, idx1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) - - def testInt32V2(self): - # This test is only temporary, once V2 is used - # by default, the axis will be wrapped to allow `axis=None`. 
- x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def IllegalIdForMultMapUnique(self): - recover_env = False - if 'DEEPREC_UNIQUE_OP_PARTITION_SIZE' in os.environ: - recover_env = True - old_env = os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] - os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = '2' - - with self.cached_session() as sess: - x = np.array([-1, 0, 1, PreservedKey], dtype=np.int64) - y, idx = unique(x, out_idx=dtypes.int64) - with self.assertRaisesRegexp( - errors_impl.InvalidArgumentError, "Input id is preserved key of dense_hash_map, " - "not supported: " + str(PreservedKey) - ): - tf_y, tf_idx = sess.run([y, idx]) - - del os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] - if recover_env: - os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = old_env - - def RunUniqueWithDifferentMaps(self, map_type, test_illegal_key=False): - recover_env = False - if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: - recover_env = True - old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] - - os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type - self.testInt32() - self.testInt32OutIdxInt64() - self.testInt64OutIdxInt64() - self.testInt64OutIdxInt32() - self.testInt32Axis() - self.testInt32V2() - if test_illegal_key: - self.IllegalIdForMultMapUnique() - - del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] - if recover_env: - os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env - - def testUniqueMultiMap(self): - self.RunUniqueWithDifferentMaps('MULTIMAP') - - def testUniqueStlMap(self): - self.RunUniqueWithDifferentMaps('STL') - - def testUniqueAbslMap(self): - self.RunUniqueWithDifferentMaps('ABSL') - - def testUniqueDenseHashMap(self): - self.RunUniqueWithDifferentMaps('GOOGLE') - - # def testBool(self): - # x = np.random.choice([True, False], size=7000) - # with self.cached_session() as sess: - # y, idx = gen_array_ops.deepray_unique(x) - # tf_y, tf_idx = self.evaluate([y, idx]) - - # self.assertEqual(len(x), len(tf_idx)) - # self.assertEqual(len(tf_y), len(np.unique(x))) - # for i in range(len(x)): - # self.assertEqual(x[i], tf_y[tf_idx[i]]) - - # def testBoolV2(self): - # x = np.random.choice([True, False], size=7000) - # with self.cached_session() as sess: - # y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) - # tf_y, tf_idx = self.evaluate([y, idx]) - - # self.assertEqual(len(x), len(tf_idx)) - # self.assertEqual(len(tf_y), len(np.unique(x))) - # for i in range(len(x)): - # self.assertEqual(x[i], tf_y[tf_idx[i]]) - - -# class UniqueWithCountsTest(test.TestCase): - -# def testInt32(self): -# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testInt32OutIdxInt64(self): -# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x, out_idx=dtypes.int64) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, 
count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testString(self): -# indx = np.random.randint(65, high=122, size=7000) -# x = [chr(i) for i in indx] - -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) -# for value, count in zip(tf_y, tf_count): -# v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] -# self.assertEqual(count, sum(v)) - -# def testInt32Axis(self): -# for dtype in [np.int32, np.int64]: -# x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) -# with self.cached_session() as sess: -# y0, idx0, count0 = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([0], dtype)) -# tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) -# y1, idx1, count1 = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([1], dtype)) -# tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) -# self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) -# self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) -# self.assertAllEqual(tf_count0, np.array([2, 1])) -# self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) -# self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) -# self.assertAllEqual(tf_count1, np.array([1, 2])) - -# def testInt32V2(self): -# # This test is only temporary, once V2 is used -# # by default, the axis will be wrapped to allow `axis=None`. 
-# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([], np.int32)) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testBool(self): -# x = np.random.choice([True, False], size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testBoolV2(self): -# x = np.random.choice([True, False], size=7000) -# with self.cached_session() as sess: -# y, idx, count = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([], np.int32)) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -if __name__ == '__main__': - test.main() diff --git a/deepray/custom_ops/utils/BUILD b/deepray/custom_ops/utils/BUILD new file mode 100644 index 00000000..a8ee59b8 --- /dev/null +++ b/deepray/custom_ops/utils/BUILD @@ -0,0 +1,127 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") +load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_copts") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "spin_rw_lock", + srcs = [ + "spin_rw_lock.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "spin_lock", + srcs = [ + "spin_lock.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "ok_status_util", + srcs = [ + "ok_status_util.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "random", + srcs = [ + "random.cc", + "random.h", + ], + deps = [ + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "check_util", + srcs = [ + "check.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "tensor_testutil", + testonly = 1, + srcs = ["tensor_testutil.cc"], + hdrs = ["tensor_testutil.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_googletest//:gtest", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_test( + name = "tensor_testutil_test", + size = "small", + srcs = ["tensor_testutil_test.cc"], + deps = [ + ":tensor_testutil", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "kernel_benchmark_testlib", + testonly = 1, + srcs = ["kernel_benchmark_testlib.cc"], + hdrs = ["kernel_benchmark_testlib.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_benchmark//:benchmark", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "fake_input", + testonly = 1, + srcs = ["fake_input.cc"], + hdrs = ["fake_input.h"], + 
copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cuda_library( + name = "ops_testutil", + testonly = 1, + srcs = ["ops_testutil.cc"], + hdrs = ["ops_testutil.h"], + deps = [ + ":tensor_testutil", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_test( + name = "ops_testutil_test", + size = "small", + srcs = ["ops_testutil_test.cc"], + linkopts = [ + "-lm", + ], + deps = [ + ":fake_input", + ":ops_testutil", + "@com_google_benchmark//:benchmark", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/deepray/custom_ops/utils/check.h b/deepray/custom_ops/utils/check.h new file mode 100644 index 00000000..066f786d --- /dev/null +++ b/deepray/custom_ops/utils/check.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CHECK_H +#define CHECK_H + +#include +#include + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif // CHECK_H diff --git a/deepray/custom_ops/utils/fake_input.cc b/deepray/custom_ops/utils/fake_input.cc new file mode 100644 index 00000000..9e751a51 --- /dev/null +++ b/deepray/custom_ops/utils/fake_input.cc @@ -0,0 +1,239 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "fake_input.h" + +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace { + +class FakeInputImpl { + public: + FakeInputImpl(const OpDef* op_def, int in_index, const NodeDef* node_def, + NodeDefBuilder* builder); + void SetN(int n); + void SetDataType(DataType dt); + void SetTypeList(DataTypeSlice dts); + Status AddInputToBuilder(); + + private: + static string FakeNodeName(int in_index); + Status GetN(int* n) const; + Status GetDataType(DataType* dt) const; + void NSources(int n, DataType dt) const; + void SourceList(DataTypeSlice dts) const; + + const OpDef* const op_def_; + const OpDef::ArgDef* const arg_; + const string in_node_; + const NodeDef* const node_def_; + NodeDefBuilder* const builder_; + + bool n_specified_; + int n_; + bool dt_specified_; + DataType dt_; + bool dts_specified_; + DataTypeSlice dts_; +}; + +FakeInputImpl::FakeInputImpl(const OpDef* op_def, int in_index, + const NodeDef* node_def, NodeDefBuilder* builder) + : op_def_(op_def), + arg_(&op_def->input_arg(in_index)), + in_node_(FakeNodeName(in_index)), + node_def_(node_def), + builder_(builder), + n_specified_(false), + dt_specified_(false), + dts_specified_(false) {} + +void FakeInputImpl::SetN(int n) { + n_specified_ = true; + n_ = n; +} + +void FakeInputImpl::SetDataType(DataType dt) { + dt_specified_ = true; + dt_ = dt; +} + +void FakeInputImpl::SetTypeList(DataTypeSlice dts) { + dts_specified_ = true; + dts_ = dts; +} + +Status FakeInputImpl::AddInputToBuilder() { + if (dts_specified_) { + SourceList(dts_); + + } else if (n_specified_ || !arg_->number_attr().empty()) { + int n; + TF_RETURN_IF_ERROR(GetN(&n)); + + DataType dt; + if (n > 0) { + TF_RETURN_IF_ERROR(GetDataType(&dt)); + } else { + dt = DT_FLOAT; + } + + NSources(n, dt); + } else { + if (!dt_specified_ && !arg_->type_list_attr().empty()) { + DataTypeVector dts; + Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts); + if (!status.ok()) { + return errors::InvalidArgument( + "Could not infer list of types for input '", arg_->name(), + "': ", status.message()); + } + SourceList(dts); + return OkStatus(); + } + + DataType dt; + TF_RETURN_IF_ERROR(GetDataType(&dt)); + builder_->Input(in_node_, 0, dt); + } + return OkStatus(); +} + +// static +string FakeInputImpl::FakeNodeName(int in_index) { + char c = 'a' + (in_index % 26); + return string(&c, 1); +} + +Status FakeInputImpl::GetN(int* n) const { + if (n_specified_) { + *n = n_; + } else { + Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n); + if (!status.ok()) { + return errors::InvalidArgument("Could not infer length of input '", + arg_->name(), "': ", status.message()); + } + } + return OkStatus(); +} + +Status FakeInputImpl::GetDataType(DataType* dt) const { + if (dt_specified_) { + *dt = dt_; + return OkStatus(); // Ignore is_ref field of arg_. 
+ } else if (arg_->type() != DT_INVALID) { + *dt = arg_->type(); + } else if (!arg_->type_attr().empty()) { + Status status = GetNodeAttr(*node_def_, arg_->type_attr(), dt); + if (!status.ok()) { + // Check if the type attr has a default + const OpDef::AttrDef* attr = FindAttr(arg_->type_attr(), *op_def_); + if (attr && attr->has_default_value()) { + *dt = attr->default_value().type(); + } else { + return errors::InvalidArgument("Could not infer type for input '", + arg_->name(), "': ", status.message()); + } + } + } else { + return errors::InvalidArgument("No type or type_attr field in arg '", + arg_->name(), "'"); + } + if (arg_->is_ref()) { + *dt = MakeRefType(*dt); + } + return OkStatus(); +} + +void FakeInputImpl::NSources(int n, DataType dt) const { + std::vector srcs; + srcs.reserve(n); + for (int i = 0; i < n; ++i) { + srcs.emplace_back(in_node_, i, dt); + } + builder_->Input(gtl::ArraySlice(srcs)); +} + +void FakeInputImpl::SourceList(DataTypeSlice dts) const { + std::vector srcs; + srcs.reserve(dts.size()); + for (size_t i = 0; i < dts.size(); ++i) { + srcs.emplace_back(in_node_, i, dts[i]); + } + builder_->Input(gtl::ArraySlice(srcs)); +} + +} // namespace + +// Public interface ------------------------------------------------------------ + +FakeInputFunctor FakeInput() { + return [](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(DataType dt) { + return [dt](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetDataType(dt); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(int n) { + return [n](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetN(n); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(int n, DataType dt) { + return [n, dt](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetN(n); + impl.SetDataType(dt); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(DataTypeSlice dts) { + // Make a copy to ensure the data will still be around when the lambda is + // called. + DataTypeVector dtv(dts.begin(), dts.end()); + return [dtv](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetTypeList(dtv); + return impl.AddInputToBuilder(); + }; +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/utils/fake_input.h b/deepray/custom_ops/utils/fake_input.h new file mode 100644 index 00000000..c3062762 --- /dev/null +++ b/deepray/custom_ops/utils/fake_input.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// These functions return values that may be passed to +// NodeDefBuilder::Input() to add an input for a test. Use them when +// you don't care about the node names/output indices providing the +// input. They also allow you to omit the input types and/or +// list length when they may be inferred. +FakeInputFunctor FakeInput(); // Infer everything +FakeInputFunctor FakeInput(DataType dt); +FakeInputFunctor FakeInput(int n); // List of length n +FakeInputFunctor FakeInput(int n, DataType dt); +FakeInputFunctor FakeInput(DataTypeSlice dts); +inline FakeInputFunctor FakeInput(std::initializer_list dts) { + return FakeInput(DataTypeSlice(dts)); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ diff --git a/deepray/custom_ops/utils/kernel_benchmark_testlib.cc b/deepray/custom_ops/utils/kernel_benchmark_testlib.cc new file mode 100644 index 00000000..cb325697 --- /dev/null +++ b/deepray/custom_ops/utils/kernel_benchmark_testlib.cc @@ -0,0 +1,210 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_benchmark_testlib.h" + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/executor_factory.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace test { + +// TODO(hongm): Convert `g` and `init` to using std::unique_ptr. 
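+//
+// Usage sketch (illustrative only; the graph-building code is left to the
+// caller): a typical micro-benchmark built on this helper constructs a Graph,
+// hands ownership to Benchmark, and lets Run() drive the timed loop:
+//
+//   static void BM_MyOp(benchmark::State& state) {
+//     Graph* g = new Graph(OpRegistry::Global());
+//     // ... add the op under test to `g` ...
+//     test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
+//   }
+//   BENCHMARK(BM_MyOp);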
+Benchmark::Benchmark(const string& device, Graph* g, + const SessionOptions* options, Graph* init, + Rendezvous* rendez, const char* executor_type, + bool old_benchmark_api) { + auto cleanup = gtl::MakeCleanup([g, init]() { + delete g; + delete init; + }); + + SessionOptions default_options; + if (!options) { + options = &default_options; + } + + CHECK(!old_benchmark_api) << "Expected new API only"; + + string t = absl::AsciiStrToUpper(device); + // Allow NewDevice to allocate a new threadpool with different number of + // threads for each new benchmark. + LocalDevice::set_use_global_threadpool(false); + + device_mgr_ = std::make_unique( + DeviceFactory::NewDevice(t, *options, "/job:localhost/replica:0/task:0")); + device_ = device_mgr_->ListDevices()[0]; + CHECK(device_) << "Could not create a " << device << " device"; + + pool_ = + new thread::ThreadPool(options->env, "blocking", port::MaxParallelism()); + + auto runner = [this](std::function closure) { + pool_->Schedule(closure); + }; + + if (rendez == nullptr) { + rendez_ = NewLocalRendezvous(); + } else { + rendez_ = rendez; + } + + const int graph_def_version = g->versions().producer(); + + flib_def_ = std::make_unique(g->flib_def()); + + pflr_ = std::unique_ptr( + new ProcessFunctionLibraryRuntime( + device_mgr_.get(), Env::Default(), nullptr, graph_def_version, + flib_def_.get(), OptimizerOptions(), pool_, nullptr, nullptr, + Rendezvous::Factory())); + + flr_ = pflr_->GetFLR(device_->name()); + + LocalExecutorParams params; + params.device = device_; + params.function_library = flr_; + params.create_kernel = [this, graph_def_version]( + const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_, flr_, props, graph_def_version, + kernel); + }; + params.delete_kernel = [](OpKernel* kernel) { + DeleteNonCachedKernel(kernel); + }; + + if (init) { + std::unique_ptr init_exec; + TF_CHECK_OK(NewExecutor(executor_type, params, *init, &init_exec)); + Executor::Args args; + args.rendezvous = rendez_; + args.runner = runner; + TF_CHECK_OK(init_exec->Run(args)); + } + + TF_CHECK_OK(NewExecutor(executor_type, params, *g, &exec_)); +} + +Benchmark::Benchmark(const string& device, Graph* g, bool old_benchmark_api) + : Benchmark(device, g, nullptr, nullptr, nullptr, "", old_benchmark_api) {} + +Benchmark::~Benchmark() { + if (device_) { + rendez_->Unref(); + // We delete `exec_` before `device_mgr_` because the `exec_` destructor may + // run kernel destructors that may attempt to access state borrowed from + // `device_mgr_`, such as the resource manager. 
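+    // `pflr_` is reset next for the same reason: it was constructed with a
+    // raw pointer to `device_mgr_`.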
+ exec_.reset(); + pflr_.reset(); + device_mgr_.reset(); + delete pool_; + } +} + +void Benchmark::Run(benchmark::State& state) { + RunWithRendezvousArgs({}, {}, state); +} + +string GetRendezvousKey(const Node* node) { + string send_device; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device)); + string recv_device; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device)); + string tensor_name; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name)); + uint64 send_device_incarnation; + TF_CHECK_OK( + GetNodeAttr(node->attrs(), "send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + return Rendezvous::CreateKey(send_device, send_device_incarnation, + recv_device, tensor_name, FrameAndIter(0, 0)); +} + +void Benchmark::RunWithRendezvousArgs( + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state) { + if (!device_ || state.max_iterations == 0) { + return; + } + Tensor unused; // In benchmark, we don't care the return value. + bool is_dead; + + // Warm up + Executor::Args args; + args.rendezvous = rendez_; + args.runner = [this](std::function closure) { + pool_->Schedule(closure); + }; + static const int kWarmupRuns = 3; + for (int i = 0; i < kWarmupRuns; ++i) { + for (const auto& p : inputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(p.first, &parsed)); + TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : outputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); + TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); + } + } + TF_CHECK_OK(device_->Sync()); + VLOG(3) << kWarmupRuns << " warmup runs done."; + + // Benchmark loop. Timer starts automatically at the beginning of the loop + // and ends automatically after the last iteration. + for (auto s : state) { + for (const auto& p : inputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(p.first, &parsed)); + TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : outputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); + TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); + } + } + TF_CHECK_OK(device_->Sync()); +} + +} // end namespace test +} // end namespace tensorflow diff --git a/deepray/custom_ops/utils/kernel_benchmark_testlib.h b/deepray/custom_ops/utils/kernel_benchmark_testlib.h new file mode 100644 index 00000000..fcab9a65 --- /dev/null +++ b/deepray/custom_ops/utils/kernel_benchmark_testlib.h @@ -0,0 +1,86 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Device; +class FunctionLibraryRuntime; +class ProcessFunctionLibraryRuntime; +struct SessionOptions; +class DynamicDeviceMgr; + +namespace test { + +class Benchmark { + public: + // "device" must be either "cpu" or "gpu". Takes ownership of "g", + // "init", and one reference on "rendez" (if not null). + // + // old_benchmark_api: If true, the benchmark is running with older API + // * In the old API, the timer needs to be stopped/restarted + // by users. + // * In the new API, the timer starts automatically at the first + // iteration of the loop and stops after the last iteration. + // TODO(vyng) Remove this once we have migrated all code to newer API. + Benchmark(const string& device, Graph* g, + const SessionOptions* options = nullptr, Graph* init = nullptr, + Rendezvous* rendez = nullptr, const char* executor_type = "", + bool old_benchmark_api = false); + + Benchmark(const string& device, Graph* g, bool old_benchmark_api); + + ~Benchmark(); + + void Run(benchmark::State& state); + + void RunWithRendezvousArgs( + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state); + + private: + thread::ThreadPool* pool_ = nullptr; // Not owned. + Device* device_ = nullptr; // Not owned. + Rendezvous* rendez_ = nullptr; + std::unique_ptr device_mgr_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + FunctionLibraryRuntime* flr_; // Not owned. + std::unique_ptr exec_; + + Benchmark(const Benchmark&) = delete; + void operator=(const Benchmark&) = delete; +}; + +// Returns the rendezvous key associated with the given Send/Recv node. +string GetRendezvousKey(const Node* node); + +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ diff --git a/deepray/custom_ops/utils/ok_status_util.h b/deepray/custom_ops/utils/ok_status_util.h new file mode 100644 index 00000000..a9c7517c --- /dev/null +++ b/deepray/custom_ops/utils/ok_status_util.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The Deepray Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef DEEPRAY_UTILS_H_ +#define DEEPRAY_UTILS_H_ + +// #define PRINT_MACRO_HELPER(x) #x +// #define PRINT_MACRO(x) #x "=" PRINT_MACRO_HELPER(x) + +namespace tensorflow { +namespace deepray { + +/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". 
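+From TensorFlow 2.15.0 onward the macro below resolves to absl::OkStatus().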
+This code is for compatibility.*/ +#if TF_VERSION_INTEGER >= 2150 +#define TFOkStatus absl::OkStatus() +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +#elif TF_VERSION_INTEGER >= 2100 +#define TFOkStatus OkStatus() +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +#else +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +// #define TFOkStatus Status::OK() +#define TFOkStatus absl::OkStatus() +#endif +} // namespace deepray +} // namespace tensorflow + +#endif // DEEPRAY_UTILS_H_ diff --git a/deepray/custom_ops/utils/ops_testutil.cc b/deepray/custom_ops/utils/ops_testutil.cc new file mode 100644 index 00000000..f694941a --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil.cc @@ -0,0 +1,271 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_properties.h" +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h" +#endif + +#include +#include +#include +#include +#include + +#include "ops_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace test { + +void SetOutputAttrs(OpKernelContext::Params* params, + std::vector* attrs) { + attrs->clear(); + for (int index = 0; index < params->op_kernel->num_outputs(); index++) { + AllocatorAttributes attr; + const bool on_host = + (params->op_kernel->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + attrs->push_back(attr); + } + params->output_attr_array = attrs->data(); +} + +} // namespace test + +OpsTestBase::OpsTestBase() : device_type_(DEVICE_CPU) { + auto device = DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); + CHECK(device) << "Could 
not create CPU device"; + + thread_pool_ = std::make_unique( + Env::Default(), /*name=*/"default", /*num_threads=*/1); + + device_ = device.get(); + device_mgr_ = std::make_unique(std::move(device)); + + allocator_ = device_->GetAllocator(AllocatorAttributes()); + + flib_def_ = std::make_unique(OpRegistry::Global(), + FunctionDefLibrary{}); + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions()); +} + +OpsTestBase::~OpsTestBase() { + for (auto& temp : tensors_) { + delete temp; + } + for (auto& temp : managed_outputs_) { + delete temp; + } + tensors_.clear(); + managed_outputs_.clear(); + context_.reset(nullptr); + params_.reset(nullptr); +} + +void OpsTestBase::SetDevice(const DeviceType& device_type, + std::unique_ptr device) { + CHECK(device_) << "No device provided"; + + device_ = device.get(); + device_type_ = device_type; +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device_type == DEVICE_GPU) { + managed_allocator_.reset(new GpuManagedAllocator()); + allocator_ = managed_allocator_.get(); + } else { + managed_allocator_.reset(); + allocator_ = device_->GetAllocator(AllocatorAttributes()); + } +#else + CHECK_NE(device_type, DEVICE_GPU) + << "Requesting GPU on binary compiled without GOOGLE_CUDA or " + "TENSORFLOW_USE_ROCM."; + allocator_ = device_->GetAllocator(AllocatorAttributes()); +#endif + + device_mgr_ = std::make_unique(std::move(device)); + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + thread_pool_.get()); +} + +void OpsTestBase::set_node_def(const NodeDef& node_def) { + node_def_.CopyFrom(node_def); +} + +NodeDef* OpsTestBase::node_def() { return &node_def_; } + +Status OpsTestBase::InitOp() { + return InitOpWithGraphVersion(TF_GRAPH_DEF_VERSION); +} + +Status OpsTestBase::InitOpWithGraphVersion(int graph_def_version) { + std::shared_ptr props; + TF_RETURN_IF_ERROR(NodeProperties::CreateFromNodeDef( + node_def_, OpRegistry::Global(), &props)); + OpKernel* kernel; + TF_RETURN_IF_ERROR(CreateOpKernel( + device_type_, device_, allocator(), /*flib=*/nullptr, + device_->resource_manager(), props, graph_def_version, &kernel)); + kernel_.reset(kernel); + input_types_ = kernel_->input_types(); + return OkStatus(); +} + +static std::function)>* GetDefaultRunner() { + static auto* const default_runner = + new std::function)>( + [](const std::function& f) { f(); }); + return default_runner; +} + +void OpsTestBase::CreateContext() { + // Make sure the old OpKernelContext is deleted before the Params + // it was using. + context_.reset(nullptr); + + // Delete the output copies from previous runs. 
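+  // (When running on GPU, GetOutput() lazily re-creates these copies in
+  // managed memory on the next access.)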
+ for (auto& temp : managed_outputs_) { + delete temp; + } + managed_outputs_.clear(); + managed_outputs_.resize(0); + + params_.reset(new OpKernelContext::Params); + params_->device = device_; + params_->frame_iter = FrameAndIter(0, 0); + params_->inputs = inputs_; + params_->op_kernel = kernel_.get(); + step_container_.reset(new ScopedStepContainer(0, [](const string&) {})); + params_->step_container = step_container_.get(); + test::SetOutputAttrs(params_.get(), &out_alloc_attrs_); + params_->slice_reader_cache = &slice_reader_cache_wrapper_; + params_->cancellation_manager = &default_cancellation_manager_; + params_->resource_manager = device_->resource_manager(); + params_->function_library = pflr_->GetFLR(device_->name()); + params_->runner = GetDefaultRunner(); + params_->session_metadata = &session_metadata(); + + context_.reset(new OpKernelContext(params_.get())); +} + +Status OpsTestBase::RunOpKernel() { + CreateContext(); + device_->Compute(kernel_.get(), context_.get()); + return context_->status(); +} + +const Tensor& OpsTestBase::GetInput(int input_index) const { + CHECK_LT(input_index, context_->num_inputs()); + CHECK(!IsRefType(context_->input_dtype(input_index))); + return context_->input(input_index); +} + +TensorValue OpsTestBase::mutable_input(int input_index) { + CHECK_LT(input_index, inputs_.size()); + return inputs_[input_index]; +} + +Tensor* OpsTestBase::GetOutput(int output_index) { + CHECK_LT(output_index, context_->num_outputs()); + Tensor* output = context_->mutable_output(output_index); +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device_type_ == DEVICE_GPU) { + managed_outputs_.resize(context_->num_outputs()); + // Copy the output tensor to managed memory if we haven't done so. + if (!managed_outputs_[output_index]) { + Tensor* managed_output = + new Tensor(allocator(), output->dtype(), output->shape()); + auto src = output->tensor_data(); + auto dst = managed_output->tensor_data(); + context_->eigen_gpu_device().memcpyDeviceToHost( + const_cast(dst.data()), src.data(), src.size()); + context_->eigen_gpu_device().synchronize(); + managed_outputs_[output_index] = managed_output; + } + output = managed_outputs_[output_index]; + } +#endif + return output; +} + +Allocator* OpsTestBase::allocator() { return allocator_; } + +OpKernel* OpsTestBase::op_kernel() { return kernel_.get(); } + +const DataTypeVector& OpsTestBase::output_types() const { + return kernel_->output_types(); +} + +Tensor* OpsTestBase::AddInput(DataType dtype, const TensorShape& shape) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + bool is_ref = IsRefType(input_types_[inputs_.size()]); + Tensor* input = new Tensor(allocator(), dtype, shape); + tensors_.push_back(input); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), dtype); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], dtype); + inputs_.push_back({nullptr, input}); + } + return input; +} + +void OpsTestBase::AddResourceInputInternal(const std::string& container_name, + const std::string& name, + const TypeIndex& type_index) { + ResourceHandle handle; + handle.set_device(device_->name()); + handle.set_container(container_name); + handle.set_name(name); + handle.set_hash_code(type_index.hash_code()); + handle.set_maybe_type_name(type_index.name()); + Tensor* input = new Tensor(allocator(), DT_RESOURCE, TensorShape({})); + input->scalar()() = handle; + tensors_.push_back(input); + 
inputs_.push_back({nullptr, input}); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/utils/ops_testutil.h b/deepray/custom_ops/utils/ops_testutil.h new file mode 100644 index 00000000..3edd4a3e --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil.h @@ -0,0 +1,212 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ +#define TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensor_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace test { + +void SetOutputAttrs(OpKernelContext::Params* params, + std::vector* attrs); + +} // namespace test + +// Helpful functions to test operators. +// +// This class will eventually be replaced / heavily modified +// to use the BrainClient interface. +class OpsTestBase : public ::testing::Test { + public: + OpsTestBase(); + + ~OpsTestBase() override; + + // Allow kernel unit tests to run on GPU + void SetDevice(const DeviceType& device_type, std::unique_ptr device); + + void set_node_def(const NodeDef& node_def); + + // Clients can manipulate the underlying NodeDef via this accessor. + NodeDef* node_def(); + + // Initializes an operator that takes in 'input_types' as input + // and output types as output. + // + // Returns the status of initialization. 
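+  //
+  // A typical kernel test first builds the NodeDef and then initializes the
+  // kernel (sketch; the op name and input type are illustrative):
+  //
+  //   TF_EXPECT_OK(NodeDefBuilder("my_op", "MyOp")
+  //                    .Input(FakeInput(DT_FLOAT))
+  //                    .Finalize(node_def()));
+  //   TF_EXPECT_OK(InitOp());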
+ Status InitOp(); + + // Only use this directly if you have a deprecated op that you need to test. + Status InitOpWithGraphVersion(int graph_def_version); + + // Adds an input for every element described by the shape. + // 'input_mapping' maps an index (0...NumElements(shape)) to a + // value. + // + // TODO(vrv): Replace with something like a BrainClient Feed. + template + void AddInput(const TensorShape& shape, std::function input_mapping) { + test::FillFn(AddInput(DataTypeToEnum::v(), shape), input_mapping); + } + + // Like AddInput but takes in an explicit arrayslice of data. + template + void AddInputFromArray(const TensorShape& shape, + const gtl::ArraySlice data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Convenience function to add an input and populate it with the elements from + // an initializer list converting the types as needed. + template + void AddInputFromList(const TensorShape& shape, + std::initializer_list data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Adds a Resource type as input. If is empty, uses the default + // container name. + template + void AddResourceInput(const string& container, const string& name, + T* resource) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + ResourceMgr* rm = device_->resource_manager(); + std::string container_name = + container.empty() ? rm->default_container() : container; + EXPECT_TRUE(rm->Create(container_name, name, resource).ok()); + AddResourceInputInternal(container_name, name, TypeIndex::Make()); + } + + // Runs an operation producing 'num_outputs' outputs. + // + // Returns the context's status after running the operation. + Status RunOpKernel(); + + // Returns the tensor input for 'input_index'. + // + // REQUIRES: 0 <= input_index < context_->num_inputs() + const Tensor& GetInput(int input_index) const; + + TensorValue mutable_input(int input_index); + + // Returns the tensor output for 'output_index'. + // + // REQUIRES: 0 <= output_index < context_->num_outputs() + Tensor* GetOutput(int output_index); + + Allocator* allocator(); + + OpKernel* op_kernel(); + + const DataTypeVector& output_types() const; + + void set_session_metadata(SessionMetadata session_metadata) { + session_metadata_ = std::move(session_metadata); + } + + const SessionMetadata& session_metadata() const { return session_metadata_; } + + protected: + void CreateContext(); + Tensor* AddInput(DataType dtype, const TensorShape& shape); + void AddResourceInputInternal(const std::string& container_name, + const std::string& name, + const TypeIndex& type_index); + + // device_mgr_ owns device_. + std::unique_ptr device_mgr_; + Device* device_; + + // The device allocator, or the managed_allocator_ below if running on GPU. + Allocator* allocator_; + + std::unique_ptr kernel_; + std::unique_ptr step_container_; + NodeDef node_def_; + DataTypeVector input_types_; + DeviceType device_type_; + + mutex lock_for_refs_; // Used as the Mutex for inputs added as refs + + gtl::InlinedVector inputs_; + // Owns Tensors. + std::vector tensors_; + // Copies of the outputs in unified memory (host and device accessible). + std::vector managed_outputs_; + + // AllocatorAttributes for the allocators of the outputs. 
+ std::vector out_alloc_attrs_; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper_; + CancellationManager default_cancellation_manager_; + std::unique_ptr params_; + std::unique_ptr context_; + // Unified memory allocator, only used when running on GPU. + std::unique_ptr managed_allocator_; + + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + std::unique_ptr thread_pool_; + + SessionMetadata session_metadata_; + + private: + OpsTestBase(const OpsTestBase&) = delete; + void operator=(const OpsTestBase&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ diff --git a/deepray/custom_ops/utils/ops_testutil_test.cc b/deepray/custom_ops/utils/ops_testutil_test.cc new file mode 100644 index 00000000..0cee1981 --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil_test.cc @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "ops_testutil.h" + +#include "fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +TEST_F(OpsTestBase, ScopedStepContainer) { + TF_EXPECT_OK(NodeDefBuilder("identity", "Identity") + .Input(FakeInput(DT_STRING)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + AddInputFromArray(TensorShape({}), {""}); + TF_EXPECT_OK(RunOpKernel()); + EXPECT_TRUE(step_container_ != nullptr); +} + +// Verify that a Resource input can be added to the test kernel. +TEST_F(OpsTestBase, ResourceVariableInput) { + TF_EXPECT_OK(NodeDefBuilder("identity", "Identity") + .Input(FakeInput(DT_RESOURCE)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + Var* var = new Var(DT_STRING); + AddResourceInput("" /* container */, "Test" /* name */, var); + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ(output->dtype(), DT_RESOURCE); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random_test.cc b/deepray/custom_ops/utils/random.cc similarity index 67% rename from deepray/custom_ops/unique_ops/cc/kernels/random_test.cc rename to deepray/custom_ops/utils/random.cc index d37c47eb..6baf1f4b 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/random_test.cc +++ b/deepray/custom_ops/utils/random.cc @@ -13,25 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/lib/random/random.h" +#include "random.h" -#include - -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { namespace random { -namespace { - -TEST(New64Test, SanityCheck) { - std::set values; - for (int i = 0; i < 1000000; i++) { - uint64 x = New64(); - EXPECT_TRUE(values.insert(x).second) << "duplicate " << x; - } + +uint64 New64Configuable() { + int64 random_64; + CHECK( + ReadInt64FromEnvVar("DEEPREC_CONFIG_RAND_64", New64(), &random_64).ok()); + return static_cast(random_64); } -} // namespace } // namespace random } // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random.h b/deepray/custom_ops/utils/random.h similarity index 82% rename from deepray/custom_ops/unique_ops/cc/kernels/random.h rename to deepray/custom_ops/utils/random.h index 29aae909..50b61140 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/random.h +++ b/deepray/custom_ops/utils/random.h @@ -21,14 +21,6 @@ limitations under the License. namespace tensorflow { namespace random { -// Return a 64-bit random value. Different sequences are generated -// in different processes. -uint64 New64(); - -// Return a 64-bit random value. Uses -// std::mersenne_twister_engine::default_seed as seed value. -uint64 New64DefaultSeed(); - // Call New64 to generate a 64-bit random value // if env var DEEPREC_CONFIG_RAND_64 not set. // Otherwise, return int64 from DEEPREC_CONFIG_RAND_64 diff --git a/deepray/custom_ops/utils/spin_lock.h b/deepray/custom_ops/utils/spin_lock.h new file mode 100644 index 00000000..ec99f589 --- /dev/null +++ b/deepray/custom_ops/utils/spin_lock.h @@ -0,0 +1,73 @@ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_SPINLOCK_H +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_SPINLOCK_H + +namespace tensorflow { +namespace { +/* Compile read-write barrier */ +#define mem_barrier() asm volatile("" : : : "memory") + +/* Pause instruction to prevent excess processor bus usage */ +#if defined(__x86_64) +#define cpu_relax() asm volatile("pause\n" : : : "memory") +#else +#define cpu_relax() asm volatile("yield\n" : : : "memory") +#endif + +#define __ASM_FORM(x) " " #x " " +#define __ASM_SEL(a, b) __ASM_FORM(a) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) +#define _ASM_PTR __ASM_SEL(.long, .quad) + +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" _ASM_ALIGN "\n" _ASM_PTR \ + "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" _ASM_ALIGN "\n" _ASM_PTR \ + "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " + +/* Atomic exchange (of various sizes) */ +static inline unsigned long xchg_64(void* ptr, unsigned long x) { +#if defined(__x86_64) + asm volatile("xchgq %0,%1" + : "=r"((unsigned long)x) + : "m"(*(volatile long*)ptr), "0"((unsigned long)x) + : "memory"); +#else + x = __atomic_exchange_n((unsigned long*)ptr, x, __ATOMIC_SEQ_CST); +#endif + + return x; +} + +static void lock_impl(unsigned long* lock) { + while (xchg_64((void*)lock, 1)) { + while (*lock) cpu_relax(); + } +} + +static void unlock_impl(unsigned long* lock) { + mem_barrier(); + *lock = 0; +} +} // namespace + +class spin_lock { + public: + spin_lock() = default; + spin_lock(const spin_lock&) = delete; + spin_lock& operator=(const spin_lock&) = delete; + + void lock() { lock_impl(&lock_); } + + 
void unlock() { unlock_impl(&lock_); } + + private: + unsigned long lock_ = 0; +}; + +} // namespace tensorflow +#endif diff --git a/deepray/custom_ops/utils/spin_rw_lock.h b/deepray/custom_ops/utils/spin_rw_lock.h new file mode 100644 index 00000000..00439a48 --- /dev/null +++ b/deepray/custom_ops/utils/spin_rw_lock.h @@ -0,0 +1,248 @@ +#ifndef TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ +#define TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ + +#define EASY_SMP_LOCK "lock;" +#define easy_atomic_set(v, i) ((v) = (i)) + +#if defined(__x86_64) +#define cpu_relax() asm volatile("pause\n" : : : "memory") +#else +#define cpu_relax() asm volatile("yield\n" : : : "memory") +#endif + +typedef volatile int64_t easy_atomic_t; +static __inline__ void easy_atomic_add(easy_atomic_t *v, int64_t i) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "addq %1,%0" + : "=m"((*v)) + : "r"(i), "m"((*v))); +#else + __atomic_add_fetch(v, i, __ATOMIC_SEQ_CST); +#endif +} +static __inline__ int64_t easy_atomic_add_return(easy_atomic_t *value, + int64_t i) { + int64_t __i = i; +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "xaddq %0, %1;" + : "=r"(i) + : "m"(*value), "0"(i)); +#else + i = __atomic_fetch_add(value, i, __ATOMIC_SEQ_CST); +#endif + return i + __i; +} +static __inline__ int64_t easy_atomic_cmp_set(easy_atomic_t *lock, int64_t old, + int64_t set) { + uint8_t res; +#if defined(__x86_64__) + __asm__ volatile(EASY_SMP_LOCK "cmpxchgq %3, %1; sete %0" + : "=a"(res) + : "m"(*lock), "a"(old), "r"(set) + : "cc", "memory"); +#else + res = __atomic_compare_exchange_n(lock, &old, set, true, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +#endif + return res; +} +static __inline__ void easy_atomic_inc(easy_atomic_t *v) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "incq %0" : "=m"(*v) : "m"(*v)); +#else + __atomic_add_fetch(v, 1, __ATOMIC_SEQ_CST); +#endif +} +static __inline__ void easy_atomic_dec(easy_atomic_t *v) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "decq %0" : "=m"(*v) : "m"(*v)); +#else + __atomic_sub_fetch(v, 1, __ATOMIC_SEQ_CST); +#endif +} + +#define EASY_OK 0 +#define EASY_ERROR (-1) +#define EASY_ABORT (-2) +#define EASY_ASYNC (-3) +#define EASY_BREAK (-4) +#define EASY_ENCODE (-5) +#define EASY_QUEUE_FULL (-6) +#define EASY_AGAIN (-EAGAIN) + +typedef struct easy_spinrwlock_t { + easy_atomic_t ref_cnt; + easy_atomic_t wait_write; +} easy_spinrwlock_t; +#define EASY_SPINRWLOCK_INITIALIZER {0, 0} +static __inline__ int easy_spinrwlock_rdlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + int cond = 1; + + while (cond) { + int loop = 1; + + do { + easy_atomic_t oldv = lock->ref_cnt; + + if (0 <= oldv && 0 == lock->wait_write) { + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, oldv + 1)) { + return ret; + } + } + + cpu_relax(); + loop <<= 1; + } while (loop < 1024); + + sched_yield(); + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_wrlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + int cond = 1; + easy_atomic_inc(&lock->wait_write); + + while (cond) { + int loop = 1; + + do { + easy_atomic_t oldv = lock->ref_cnt; + + if (0 == oldv) { + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, -1)) { + cond = 0; + break; + } + } + + cpu_relax(); + loop <<= 1; + } while (loop < 1024); + + if (cond) sched_yield(); + } + + easy_atomic_dec(&lock->wait_write); + } + + return ret; +} +static __inline__ int 
easy_spinrwlock_try_rdlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + ret = EASY_AGAIN; + easy_atomic_t oldv = lock->ref_cnt; + + if (0 <= oldv && 0 == lock->wait_write) { + easy_atomic_t newv = oldv + 1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + ret = EASY_OK; + } + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_try_wrlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + ret = EASY_AGAIN; + easy_atomic_t oldv = lock->ref_cnt; + + if (0 == oldv) { + easy_atomic_t newv = -1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + ret = EASY_OK; + } + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_unlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + while (1) { + easy_atomic_t oldv = lock->ref_cnt; + + if (-1 == oldv) { + easy_atomic_t newv = 0; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + break; + } + } else if (0 < oldv) { + easy_atomic_t newv = oldv - 1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + break; + } + } else { + ret = EASY_ERROR; + break; + } + } + } + + return ret; +} +namespace tensorflow { + +class spin_rd_lock { + public: + typedef easy_spinrwlock_t lock_type; + + explicit spin_rd_lock(lock_type *lock) : lock_(lock) { + easy_spinrwlock_rdlock(lock_); + } + explicit spin_rd_lock(lock_type &lock) : lock_(&lock) { + easy_spinrwlock_rdlock(lock_); + } + ~spin_rd_lock() { easy_spinrwlock_unlock(lock_); } + + private: + lock_type *lock_; +}; + +class spin_wr_lock { + public: + typedef easy_spinrwlock_t lock_type; + + explicit spin_wr_lock(lock_type *lock) : lock_(lock) { + easy_spinrwlock_wrlock(lock_); + } + explicit spin_wr_lock(lock_type &lock) : lock_(&lock) { + easy_spinrwlock_wrlock(lock_); + } + ~spin_wr_lock() { easy_spinrwlock_unlock(lock_); } + + private: + lock_type *lock_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ diff --git a/deepray/custom_ops/utils/tensor_testutil.cc b/deepray/custom_ops/utils/tensor_testutil.cc new file mode 100644 index 00000000..a97daa7a --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil.cc @@ -0,0 +1,294 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensor_testutil.h" + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace test { + +::testing::AssertionResult IsSameType(const Tensor& x, const Tensor& y) { + if (x.dtype() != y.dtype()) { + return ::testing::AssertionFailure() + << "Tensors have different dtypes (" << x.dtype() << " vs " + << y.dtype() << ")"; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult IsSameShape(const Tensor& x, const Tensor& y) { + if (!x.IsSameSize(y)) { + return ::testing::AssertionFailure() + << "Tensors have different shapes (" << x.shape().DebugString() + << " vs " << y.shape().DebugString() << ")"; + } + return ::testing::AssertionSuccess(); +} + +template +static ::testing::AssertionResult EqualFailure(const T& x, const T& y) { + return ::testing::AssertionFailure() + << std::setprecision(std::numeric_limits::digits10 + 2) << x + << " not equal to " << y; +} + +template <> +::testing::AssertionResult EqualFailure(const int8& x, const int8& y) { + return EqualFailure(static_cast(x), static_cast(y)); +} + +static ::testing::AssertionResult IsEqual(float x, float y, Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (t == Tolerance::kNone) { + if (x == y) return ::testing::AssertionSuccess(); + } else { + if (::testing::internal::CmpHelperFloatingPointEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +static ::testing::AssertionResult IsEqual(double x, double y, Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (t == Tolerance::kNone) { + if (x == y) return ::testing::AssertionSuccess(); + } else { + if (::testing::internal::CmpHelperFloatingPointEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +static ::testing::AssertionResult IsEqual(Eigen::half x, Eigen::half y, + Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + + // Below is a reimplementation of CmpHelperFloatingPointEQ, which + // we cannot use because Eigen::half is not default-constructible. + + if (Eigen::numext::isnan(x) || Eigen::numext::isnan(y)) + return EqualFailure(x, y); + + auto sign_and_magnitude_to_biased = [](uint16_t sam) { + const uint16_t kSignBitMask = 0x8000; + if (kSignBitMask & sam) return ~sam + 1; // negative number. + return kSignBitMask | sam; // positive number. + }; + + auto xb = sign_and_magnitude_to_biased(Eigen::numext::bit_cast(x)); + auto yb = sign_and_magnitude_to_biased(Eigen::numext::bit_cast(y)); + if (t == Tolerance::kNone) { + if (xb == yb) return ::testing::AssertionSuccess(); + } else { + auto distance = xb >= yb ? 
xb - yb : yb - xb; + const uint16_t kMaxUlps = 4; + if (distance <= kMaxUlps) return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +template +static ::testing::AssertionResult IsEqual(const T& x, const T& y, Tolerance t) { + if (::testing::internal::CmpHelperEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + return EqualFailure(x, y); +} + +template +static ::testing::AssertionResult IsEqual(const std::complex& x, + const std::complex& y, + Tolerance t) { + if (IsEqual(x.real(), y.real(), t) && IsEqual(x.imag(), y.imag(), t)) + return ::testing::AssertionSuccess(); + return EqualFailure(x, y); +} + +template +static void ExpectEqual(const Tensor& x, const Tensor& y, + Tolerance t = Tolerance::kDefault) { + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(IsEqual(Tx[i], Ty[i], t)) << "i = " << (++num_failures, i); + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; + } +} + +template +static ::testing::AssertionResult IsClose(const T& x, const T& y, const T& atol, + const T& rtol) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (x == y) return ::testing::AssertionSuccess(); // Handle infinity. + auto tolerance = atol + rtol * Eigen::numext::abs(x); + if (Eigen::numext::abs(x - y) <= tolerance) + return ::testing::AssertionSuccess(); + return ::testing::AssertionFailure() << x << " not close to " << y; +} + +template +static ::testing::AssertionResult IsClose(const std::complex& x, + const std::complex& y, + const T& atol, const T& rtol) { + if (IsClose(x.real(), y.real(), atol, rtol) && + IsClose(x.imag(), y.imag(), atol, rtol)) + return ::testing::AssertionSuccess(); + return ::testing::AssertionFailure() << x << " not close to " << y; +} + +// Return type can be different from T, e.g. float for T=std::complex. +template +static auto GetTolerance(double tolerance) { + using Real = typename Eigen::NumTraits::Real; + auto default_tol = static_cast(5.0) * Eigen::NumTraits::epsilon(); + auto result = tolerance < 0.0 ? 
default_tol : static_cast(tolerance); + EXPECT_GE(result, static_cast(0)); + return result; +} + +template +static void ExpectClose(const Tensor& x, const Tensor& y, double atol, + double rtol) { + auto typed_atol = GetTolerance(atol); + auto typed_rtol = GetTolerance(rtol); + + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(IsClose(Tx[i], Ty[i], typed_atol, typed_rtol)) + << "i = " << (++num_failures, i) << " Tx[i] = " << Tx[i] + << " Ty[i] = " << Ty[i]; + ASSERT_LT(num_failures, max_failures) + << "Too many mismatches (atol = " << atol << " rtol = " << rtol + << "), giving up."; + } + EXPECT_EQ(num_failures, 0) + << "Mismatches detected (atol = " << atol << " rtol = " << rtol << ")."; +} + +void ExpectEqual(const Tensor& x, const Tensor& y, Tolerance t) { + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + switch (x.dtype()) { + case DT_FLOAT: + return ExpectEqual(x, y, t); + case DT_DOUBLE: + return ExpectEqual(x, y, t); + case DT_INT32: + return ExpectEqual(x, y); + case DT_UINT32: + return ExpectEqual(x, y); + case DT_UINT16: + return ExpectEqual(x, y); + case DT_UINT8: + return ExpectEqual(x, y); + case DT_INT16: + return ExpectEqual(x, y); + case DT_INT8: + return ExpectEqual(x, y); + case DT_STRING: + return ExpectEqual(x, y); + case DT_COMPLEX64: + return ExpectEqual(x, y, t); + case DT_COMPLEX128: + return ExpectEqual(x, y, t); + case DT_INT64: + return ExpectEqual(x, y); + case DT_UINT64: + return ExpectEqual(x, y); + case DT_BOOL: + return ExpectEqual(x, y); + case DT_QINT8: + return ExpectEqual(x, y); + case DT_QUINT8: + return ExpectEqual(x, y); + case DT_QINT16: + return ExpectEqual(x, y); + case DT_QUINT16: + return ExpectEqual(x, y); + case DT_QINT32: + return ExpectEqual(x, y); + case DT_BFLOAT16: + return ExpectEqual(x, y, t); + case DT_HALF: + return ExpectEqual(x, y, t); + case DT_FLOAT8_E5M2: + return ExpectEqual(x, y, t); + case DT_FLOAT8_E4M3FN: + return ExpectEqual(x, y, t); + case DT_INT4: + return ExpectEqual(x, y, t); + case DT_UINT4: + return ExpectEqual(x, y, t); + default: + EXPECT_TRUE(false) << "Unsupported type : " << DataTypeString(x.dtype()); + } +} + +void ExpectClose(const Tensor& x, const Tensor& y, double atol, double rtol) { + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + switch (x.dtype()) { + case DT_HALF: + return ExpectClose(x, y, atol, rtol); + case DT_BFLOAT16: + return ExpectClose(x, y, atol, rtol); + case DT_FLOAT: + return ExpectClose(x, y, atol, rtol); + case DT_DOUBLE: + return ExpectClose(x, y, atol, rtol); + case DT_COMPLEX64: + return ExpectClose(x, y, atol, rtol); + case DT_COMPLEX128: + return ExpectClose(x, y, atol, rtol); + default: + EXPECT_TRUE(false) << "Unsupported type : " << DataTypeString(x.dtype()); + } +} + +::testing::AssertionResult internal_test::IsClose(Eigen::half x, Eigen::half y, + double atol, double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} +::testing::AssertionResult internal_test::IsClose(float x, float y, double atol, + double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} +::testing::AssertionResult internal_test::IsClose(double x, double y, + double atol, double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} + +} // end namespace test +} // end namespace tensorflow diff --git 
a/deepray/custom_ops/utils/tensor_testutil.h b/deepray/custom_ops/utils/tensor_testutil.h new file mode 100644 index 00000000..53ad5969 --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil.h @@ -0,0 +1,162 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace test { + +// Constructs a scalar tensor with 'val'. +template +Tensor AsScalar(const T& val) { + Tensor ret(DataTypeToEnum::value, {}); + ret.scalar()() = val; + return ret; +} + +// Constructs a flat tensor with 'vals'. +template +Tensor AsTensor(gtl::ArraySlice vals) { + Tensor ret(DataTypeToEnum::value, {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; +} + +// Constructs a tensor of "shape" with values "vals". +template +Tensor AsTensor(gtl::ArraySlice vals, const TensorShape& shape) { + Tensor ret; + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; +} + +// Fills in '*tensor' with 'vals'. E.g., +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillValues(&x, {11, 21, 21, 22}); +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +// Fills in '*tensor' with 'vals', converting the types as needed. +template +void FillValues(Tensor* tensor, std::initializer_list vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + size_t i = 0; + for (auto itr = vals.begin(); itr != vals.end(); ++itr, ++i) { + flat(i) = T(*itr); + } + } +} + +// Fills in '*tensor' with a sequence of value of val, val+1, val+2, ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillIota(&x, 1.0); +template +void FillIota(Tensor* tensor, const T& val) { + auto flat = tensor->flat(); + std::iota(flat.data(), flat.data() + flat.size(), val); +} + +// Fills in '*tensor' with a sequence of value of fn(0), fn(1), ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillFn(&x, [](int i)->float { return i*i; }); +template +void FillFn(Tensor* tensor, std::function fn) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) flat(i) = fn(i); +} + +// Expects "x" and "y" are tensors of the same type, same shape, and identical +// values (within 4 ULPs for floating point types unless explicitly disabled). 
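+// (A ULP, "unit in the last place", is the gap between adjacent representable
+// values: for float, 1.0f and 1.0f + 2^-23 are exactly 1 ULP apart, so
+// Tolerance::kDefault still treats them as equal, while Tolerance::kNone
+// requires x == y.)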
+enum class Tolerance { + kNone, + kDefault, +}; +void ExpectEqual(const Tensor& x, const Tensor& y, + Tolerance t = Tolerance ::kDefault); + +// Expects "x" and "y" are tensors of the same (floating point) type, +// same shape and element-wise difference between x and y is no more +// than atol + rtol * abs(x). If atol or rtol is negative, the data type's +// epsilon * kSlackFactor is used. +void ExpectClose(const Tensor& x, const Tensor& y, double atol = -1.0, + double rtol = -1.0); + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// equal values. Consider using ExpectEqual above instead. +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectEqual(x, y); +} + +::testing::AssertionResult IsSameType(const Tensor& x, const Tensor& y); +::testing::AssertionResult IsSameShape(const Tensor& x, const Tensor& y); + +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y, + std::function is_equal) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(is_equal(Tx[i], Ty[i])) << "i = " << (++num_failures, i); + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; + } +} + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// approximate equal values. Consider using ExpectClose above instead. +template +void ExpectTensorNear(const Tensor& x, const Tensor& y, double atol) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectClose(x, y, atol, /*rtol=*/0.0); +} + +// For tensor_testutil_test only. +namespace internal_test { +::testing::AssertionResult IsClose(Eigen::half x, Eigen::half y, + double atol = -1.0, double rtol = -1.0); +::testing::AssertionResult IsClose(float x, float y, double atol = -1.0, + double rtol = -1.0); +::testing::AssertionResult IsClose(double x, double y, double atol = -1.0, + double rtol = -1.0); +} // namespace internal_test + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ diff --git a/deepray/custom_ops/utils/tensor_testutil_test.cc b/deepray/custom_ops/utils/tensor_testutil_test.cc new file mode 100644 index 00000000..0e3b1572 --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil_test.cc @@ -0,0 +1,335 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensor_testutil.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace test { +namespace { + +using internal_test::IsClose; + +template +void TestEdgeCasesNear() { + EXPECT_TRUE(IsClose(Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity(), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_FALSE( + IsClose(Eigen::NumTraits::lowest(), Eigen::NumTraits::highest(), + static_cast(Eigen::NumTraits::highest()), 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), 0.0, 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::infinity(), 0.0)); +} + +// For debug printing. Example usage: +// dumpFloatingPointStorage( +// static_cast(-2.71f)); +// dumpFloatingPointStorage(-2.718281f); +// dumpFloatingPointStorage (-2.71828182846); +template +void dumpFloatingPointStorage(T value) { + U* integral = reinterpret_cast(&value); + int shift_amount = (sizeof(U) << 3) - 1; + int exponent_bits = 2 + (log2(sizeof(U)) * 3); + U mask = static_cast(1) << shift_amount; + for (int bits = 0; bits <= shift_amount; ++bits) { + std::cout << ((*integral & mask) > 0); + if (bits == 0 || bits == exponent_bits) std::cout << " "; + mask >>= 1; + } + std::cout << std::endl; + printf("%.20lf\n", static_cast(value)); +} + +TEST(TensorTestUtilTest, ExpectTensorNearHalf) { + // Eigen::half has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + // The exponent is offset at 15. + // https://en.wikipedia.org/wiki/Half-precision_floating-point_format + typedef Eigen::half T; + + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.0f), 0.0, 0.0)); + EXPECT_TRUE(IsClose(static_cast(0.0f), static_cast(-0.0f), 0.0, 0.0)); + EXPECT_TRUE( + IsClose(static_cast(3.141592f), static_cast(3.141592f), 0.0, 0.0)); + + // 0 10010 0001111110 -> 1150/128 = 8.984375 vs + // 0 10010 0001111111 -> 1151/128 = 8.9921875 (diff = 0.0078125) + EXPECT_TRUE( + IsClose(static_cast(8.9875f), static_cast(8.99f), 0.0078125, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(8.9875f), static_cast(8.99f), 0.007, 0.0)); + + // 0 11000 0110100000 -> 1440/2 = 720 vs + // 0 11000 0110100001 -> 1441/2 = 720.5 (diff = 0.5) + EXPECT_TRUE( + IsClose(static_cast(720.2f), static_cast(720.3f), 0.5, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(720.2f), static_cast(720.3f), 0.4, 0.0)); + + // 0 11001 0011010010 -> 1234 vs + // 0 11001 0011010011 -> 1235 (diff = 1) + // Rounds to even (1234.5 -> 1234). + EXPECT_TRUE( + IsClose(static_cast(1234.f), static_cast(1235.f), 1.0, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(1234.5f), static_cast(1235.f), 0.5, 0.0)); + EXPECT_TRUE( + IsClose(static_cast(1234.5f), static_cast(1235.f), 1.0, 0.0)); + + // 1 10000 0101101100 -> -1388/512 = -2.7109375 vs + // 1 10000 0101110001 -> -1393/512 = -2.720703125 (diff = 0.009765625) + EXPECT_TRUE( + IsClose(static_cast(-2.71f), static_cast(-2.72f), 0.01, 0.0)); + + TestEdgeCasesNear(); +} + +TEST(TensorTestUtilTest, ExpectTensorNearFloat) { + // float has 1 sign bit, 8 exponent bits, and 23 mantissa bits. + // The exponent offset is 127. 
+ // https://en.wikipedia.org/wiki/Single-precision_floating-point_format + typedef float T; + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(1.0f, 1.0f, 0.0f, 0.0f)); + EXPECT_TRUE(IsClose(0.0f, -0.0f, 0.0f, 0.0f)); + EXPECT_TRUE(IsClose(3.14159265359f, 3.14159265359f, 0.0f, 0.0f)); + + // 0 10000010 00011111100110011001101 -> 9,424,077/2^20 vs + // 0 10000010 00011111100110100110110 -> 9,424,182/2^20 + // diff = 105/2^20 = 0.000100135803223 + EXPECT_TRUE(IsClose(8.9875f, 8.9876f, 0.0001002f, 0.0f)); + EXPECT_FALSE(IsClose(8.9875f, 8.9876f, 0.0001f, 0.0f)); + + // 0 10001000 01101000000110011101001 -> 11,799,785/2^14 vs + // 0 10001000 01101000000110011101010 -> 11,799,786/2^14 + // diff = 1/2^14 = 0.00006103515625 + EXPECT_TRUE(IsClose(720.2017f, 720.2018f, 0.0001f, 0.0f)); + EXPECT_FALSE(IsClose(720.20175f, 720.20185f, 0.0001f, 0.0f)); + EXPECT_TRUE(IsClose(720.20175f, 720.20185f, 0.00013f, 0.0f)); + + // 0 10011001 11010110111100110100010 -> 15,432,098*2^3 vs + // 0 10011001 11010110111100110100011 -> 15,432,099*2^3 (diff = 2^3 = 8) + EXPECT_FALSE(IsClose(123456788.f, 123456789.f, 4.0f, 0.0f)); + EXPECT_TRUE(IsClose(123456788.f, 123456789.f, 8.0f, 0.0f)); + + // 1 10000000 01011011111100001010001 -> 11,401,297/2^22 vs + // 1 10000000 01011011111100001010101 -> 11,401,301/2^22 + // diff = 4/2^22 = 0.000000953674316 + EXPECT_TRUE(IsClose(-2.718281f, -2.718282f, 0.1f, 0.0f)); + + TestEdgeCasesNear(); +} + +TEST(TensorTestUtilTest, ExpectTensorNearDouble) { + // double has 1 sign bit, 11 exponent bits, and 52 mantissa bits. + // The exponent offset is 1,023. + // https://en.wikipedia.org/wiki/Double-precision_floating-point_format + typedef double T; + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(1.0, 1.0, 0.0, 0.0)); + EXPECT_TRUE(IsClose(0.0, -0.0, 0.0, 0.0)); + EXPECT_TRUE(IsClose(3.14159265359, 3.14159265359, 0.0, 0.0)); + + // 0 10000000010 0001111110011001100110011001100110011001100110011010 + // -> 5,059,512,706,374,042/2^49 vs + // 0 10000000010 0001111110011010011010110101000010110000111100101000 + // -> 5,059,569,001,369,384/2^49 + // diff = 56,294,995,342/2^49 = 9.999999999976694198267E-5 + EXPECT_TRUE(IsClose(8.9875, 8.9876, 0.0001, 0.0)); + + // 0 10000001111 1000100101110000001100111010100100101010001100000101 + // -> 6,921,439,564,440,325/2^36 + // 0 10000001111 1000100101110000001100111010111110110111111010010001 + // -> 6,921,439,571,312,273/2^36 + // diff = 6,871,948/2^36 = 1.000000047497451305389E-4 + EXPECT_FALSE(IsClose(100720.2018, 100720.2019, 0.0001, 0.0)); + EXPECT_TRUE(IsClose(100720.2018, 100720.2019, 1.00000005e-4, 0.0)); + + // 0 10000110100 0101111011100010101000101110101101011010010111000100 + // -> 6,172,839,450,617,284 * 2 + // 0 10000110100 0101111011100010101000101110101101011010010111000011 + // -> 6,172,839,450,617,283 * 2 + // diff = 1 * 2 = 2 + EXPECT_FALSE(IsClose(12345678901234567., 12345678901234566., 1.0, 0.0)); + EXPECT_TRUE(IsClose(12345678901234567., 12345678901234566., 2.0, 0.0)); + + // 1 10000000000 0101101111110000101010001011000101000101111111001111 + // -> -6,121,026,514,870,223/2^51 + // 1 10000000000 0101101111110000101010001011000101001011011111000101 + // -> -6,121,026,514,892,741/2^51 + // diff = 22,518/2^51 = 1.00000008274037099909E-11 + EXPECT_FALSE(IsClose(-2.71828182846, -2.71828182847, 1.0e-11, 0.0)); + EXPECT_TRUE(IsClose(-2.71828182846, -2.71828182847, 1.00000009e-11, 0.0)); + + TestEdgeCasesNear(); +} + +// Tensor::Slice() and Tensor::SubSlice() may return unaligned Tensor. 
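+// ExpectEqual/ExpectClose read elements through unaligned_flat(), so they
+// accept such tensors; this test covers that path via SubSlice(3).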
+TEST(TensorTestUtilTest, ExpectTensorNearSlice) { + Tensor x(DT_FLOAT, TensorShape({7, 3})); + test::FillFn(&x, [](int i) { return 1.0f; }); + + test::ExpectTensorNear( + x.SubSlice(3), test::AsTensor({1.0, 1.0, 1.0}, TensorShape({3})), + 1e-10); +} + +template +void TestEdgeCasesClose() { + EXPECT_TRUE(IsClose(Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity(), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity())); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + static_cast(Eigen::NumTraits::highest()), + static_cast(Eigen::NumTraits::highest()))); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), 0.0, 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::infinity(), 0.0)); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseHalf) { + typedef Eigen::half T; + + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.1f), 0.1, 0.1)); + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.0f), 0.0, 0.0)); + EXPECT_FALSE(IsClose(static_cast(1.0f), static_cast(1.1f), 0.0, 0.0)); + + // Epsilon: 0 00010 0000000000 -> 2^-13 = 0.0001220703125 + // Default Tolerance: 0 00100 0100000000 -> 5/2^13 = 0.0006103515625 + + // 1.234 -> 0 01111 0011110000 -> 1264/2^10 = 1.234375 + // 1.233 -> 0 01111 0011101111 -> 1263/2^10 = 1.2333984375 + // 1.235 -> 0 01111 0011110001 -> 1265/2^10 = 1.2353515625 + // 1.232 -> 0 01111 0011101110 -> 1262/2^10 = 1.232421875 + // 1.236 -> 0 01111 0011110010 -> 1266/2^10 = 1.236328125 + // 1/2^10 = 0.0009765625E + // Threshold = 0.0013637542724609375 + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.234f))); + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.233f))); + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.235f))); + + // Diff = 0.001953125 + EXPECT_FALSE(IsClose(static_cast(1.234f), static_cast(1.232f))); + EXPECT_FALSE(IsClose(static_cast(1.234f), static_cast(1.236f))); + EXPECT_TRUE( + IsClose(static_cast(1.234f), static_cast(1.232f), 8e-4f, 1e-3f)); + EXPECT_TRUE( + IsClose(static_cast(1.234f), static_cast(1.236f), 1.4e-3f, 5e-4f)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE( + IsClose(static_cast(3.141592f), static_cast(3.141593f), 0.0, 0.0)); + + // Trivial case. 
+ EXPECT_FALSE(IsClose(static_cast(1e4f), static_cast(1e-4f))); + + TestEdgeCasesClose(); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseFloat) { + typedef float T; + + EXPECT_TRUE(IsClose(1.0f, 1.1f, 0.1f, 0.1f)); + EXPECT_TRUE(IsClose(1.0f, 1.0f, 0.0f, 0.0f)); + EXPECT_FALSE(IsClose(1.0f, 1.1f, 0.0f, 0.0f)); + + // Epsilon: 2^-23 ~ 0.00000011920928955078 + // Default Tolerance: 5/2^23 ~ 0.00000059604644775391 + + // 1.234567f -> 10,356,299/2^23 ~ 1.234567046165466308594 + // 1.234568f -> 10,356,307/2^23 ~ 1.234567999839782714844 + // 1.234566f -> 10,356,290/2^23 ~ 1.234565973281860351563 + // 1.234569f -> 10,356,315/2^23 ~ 1.234568953514099121094 + // 1.234565f -> 10,356,282/2^23 ~ 1.234565019607543945313 + // Threshold ~ 0.00000133190576434572 + EXPECT_TRUE(IsClose(1.234567f, 1.234567f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234568f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234566f)); + EXPECT_FALSE(IsClose(1.234567f, 1.234569f)); + EXPECT_FALSE(IsClose(1.234567f, 1.234565f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234569f, 8e-7f, 1e-6f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234565f, 3e-7f, 1.5e-6f)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE(IsClose(3.14159265f, 3.14159266f, 0.0f, 0.0f)); + + // Trivial cases + EXPECT_FALSE(IsClose(1e8f, 1e-8f)); + EXPECT_FALSE(IsClose(1e15f, 1e-15f)); + + TestEdgeCasesClose(); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseDouble) { + typedef double T; + + EXPECT_TRUE(IsClose(1.0, 1.1, 0.1, 0.1)); + EXPECT_TRUE(IsClose(1.0, 1.0, 0.0, 0.0)); + EXPECT_FALSE(IsClose(1.0, 1.1, 0.0, 0.0)); + + // Epsilon: 2^-52 ~ 2.220446049250313080847E-16 + // Default Tolerance: 5/2^52 ~ 1.110223024625156540424E-15 + + // 1.234567890123456 -> 5,559,999,489,923,576/2^52 ~ 1.234567890123456024298 + // 1.234567890123457 -> 5,559,999,489,923,580/2^52 ~ 1.234567890123456912477 + // 1.234567890123455 -> 5,559,999,489,923,571/2^52 ~ 1.234567890123454914075 + // 1.234567890123458 -> 5,559,999,489,923,585/2^52 ~ 1.2345678901234580227 + // 1.234567890123454 -> 5,559,999,489,923,567/2^52 ~ 1.234567890123454025897 + // 1.234567890123459 -> 5,559,999,489,923,589/2^52 ~ 1.234567890123458910878 + // 1.234567890123453 -> 5,559,999,489,923,562/2^52 ~ 1.234567890123452915674 + // Threshold ~ 2.480868721703117812159E-15 + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123456)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123457)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123455)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123458)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123454)); + EXPECT_FALSE(IsClose(1.234567890123456, 1.234567890123459)); + EXPECT_FALSE(IsClose(1.234567890123456, 1.234567890123453)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123459, 9.5e-16, 1.6e-15)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123453, 7e-16, 2e-15)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE(IsClose(3.141592653589793238, 3.141592653589793239, 0.0, 0.0)); + + // Trivial cases + EXPECT_FALSE(IsClose(1e15, 1e-15)); + EXPECT_FALSE(IsClose(1e30, 1e-30)); + + TestEdgeCasesClose(); +} + +} // namespace +} // namespace test +} // namespace tensorflow diff --git a/deepray/custom_ops/zero_out/BUILD b/deepray/custom_ops/zero_out/BUILD index 49c053d0..b8ac1cd1 100644 --- a/deepray/custom_ops/zero_out/BUILD +++ b/deepray/custom_ops/zero_out/BUILD @@ -10,6 +10,9 @@ custom_op_library( "cc/kernels/zero_out_kernels.cc", "cc/ops/zero_out_ops.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) 
py_library( @@ -37,5 +40,8 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":zero_out_ops", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc b/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc index 56271426..abbe95e1 100644 --- a/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc +++ b/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc @@ -15,6 +15,11 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" +#define PRINT_MACRO_HELPER(x) #x +#define PRINT_MACRO(x) #x "=" PRINT_MACRO_HELPER(x) + +#pragma message(PRINT_MACRO(_GLIBCXX_USE_CXX11_ABI)) + using namespace tensorflow; class ZeroOutOp : public OpKernel { @@ -24,6 +29,10 @@ class ZeroOutOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& input_tensor = context->input(0); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()), + errors::InvalidArgument("ZeroOut expects a 1-D vector.")); + auto input = input_tensor.flat(); // Create an output tensor diff --git a/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc b/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc index 70fd824b..71aec83f 100644 --- a/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc +++ b/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -23,5 +24,5 @@ REGISTER_OP("ZeroOut") .Output("zeroed: int32") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return Status::OK(); + return TFOkStatus; }); diff --git a/deepray/datasets/adult_census_income/adult_census_income.py b/deepray/datasets/adult_census_income/adult_census_income.py index e5ca1175..5ec8cced 100644 --- a/deepray/datasets/adult_census_income/adult_census_income.py +++ b/deepray/datasets/adult_census_income/adult_census_income.py @@ -4,13 +4,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler, LabelEncoder -FLAGS = flags.FLAGS - dir_path = os.path.dirname(os.path.realpath(__file__)) if os.path.exists(os.path.join(dir_path, 'feature_map.csv')): FLAGS([ @@ -19,7 +17,7 @@ ]) -class Adult_census_income(DataPipeLine): +class Adult_census_income(DataPipeline): def __init__(self, data_path='/workspaces/dataset/census/adult.csv'): super().__init__() @@ -48,9 +46,7 @@ def __init__(self, data_path='/workspaces/dataset/census/adult.csv'): f"--num_train_examples={self.train_df.shape[0]}", ]) - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if is_training: target = self.train_df.pop('income') dataset = tf.data.Dataset.from_tensor_slices((dict(self.train_df), target)) diff --git a/deepray/datasets/adult_census_income/adult_census_income_test.py 
b/deepray/datasets/adult_census_income/adult_census_income_test.py index 7724995b..4027f716 100644 --- a/deepray/datasets/adult_census_income/adult_census_income_test.py +++ b/deepray/datasets/adult_census_income/adult_census_income_test.py @@ -11,8 +11,6 @@ from deepray.datasets.adult_census_income import Adult_census_income from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py b/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py index 22ccf14d..c232ac93 100644 --- a/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py +++ b/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py @@ -3,21 +3,19 @@ from tensorflow.python.data.ops import dataset_ops from deepray.custom_ops.parquet_dataset import parquet_dataset_ops -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS - -class Ali_display_ad_click(DataPipeLine): +class Ali_display_ad_click(DataPipeline): def parse(self, record): label_map = {} - for label in FLAGS.label: + for label in flags.FLAGS.label: # label_map[label] = record.pop(label) label_map[label] = tf.reshape(record.pop(label), [-1, 1]) return record, label_map - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): """Makes dataset (of filenames) from filename glob patterns.""" # Extract lines from input files using the Dataset API. @@ -30,16 +28,17 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat parquet_dataset_ops.DataFrame.Field(k, dtype, ragged_rank=1 if length != 1 else 0) for k, dtype, length in self.feature_map[["name", "dtype", "length"]].values ], - num_parallel_reads=FLAGS.parallel_reads_per_file if FLAGS.parallel_reads_per_file else dataset_ops.AUTOTUNE, + num_parallel_reads=flags.FLAGS.parallel_reads_per_file + if flags.FLAGS.parallel_reads_per_file else dataset_ops.AUTOTUNE, ) dataset = dataset.map( map_func=self.parse, - num_parallel_calls=FLAGS.parallel_parse if FLAGS.parallel_parse else dataset_ops.AUTOTUNE, + num_parallel_calls=flags.FLAGS.parallel_parse if flags.FLAGS.parallel_parse else dataset_ops.AUTOTUNE, ) - if FLAGS.shuffle_buffer: + if flags.FLAGS.shuffle_buffer: dataset = dataset.apply( - tf.data.experimental.shuffle_and_repeat(buffer_size=FLAGS.shuffle_buffer, count=FLAGS.epochs) + tf.data.experimental.shuffle_and_repeat(buffer_size=flags.FLAGS.shuffle_buffer, count=flags.FLAGS.epochs) ) else: - dataset = dataset.repeat(FLAGS.epochs) + dataset = dataset.repeat(flags.FLAGS.epochs) return dataset diff --git a/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py b/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py index 64c1c324..f0d1dd39 100644 --- a/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py +++ b/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py @@ -11,8 +11,6 @@ from deepray.datasets.ali_display_ad_click.ali_display_ad_click import Ali_display_ad_click from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/amazon_books_2014/amazon_books_2014.py b/deepray/datasets/amazon_books_2014/amazon_books_2014.py index ecdc5eb1..2ea68ba3 100644 
--- a/deepray/datasets/amazon_books_2014/amazon_books_2014.py
+++ b/deepray/datasets/amazon_books_2014/amazon_books_2014.py
@@ -19,12 +19,6 @@
 from deepray.datasets.tfrecord_pipeline import TFRecordPipeline
 
-FLAGS = flags.FLAGS
-FLAGS([
-  sys.argv[0],
-  "--num_train_examples=11932672",
-])
-
 LABEL = ["label"]
 NEGATIVE_HISTORY = ["item_feat_0_neg", "item_feat_1_neg"]
 POSITIVE_HISTORY = ["item_feat_0_pos", "item_feat_1_pos"]
@@ -37,6 +31,10 @@ class AmazonBooks2014(TFRecordPipeline):
   def __init__(self, max_seq_length, **kwargs):
     super().__init__(**kwargs)
     self._max_seq_length = max_seq_length
+    FLAGS([
+      sys.argv[0],
+      "--num_train_examples=11932672",
+    ])
 
   def parser(self, record):
     tf_feature_spec = {
diff --git a/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py b/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
index 44681f12..f89843d7 100644
--- a/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
+++ b/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
@@ -11,8 +11,6 @@
 from deepray.utils.benchmark import PerformanceCalculator
 from .amazon_books_2014 import AmazonBooks2014
 
-FLAGS = flags.FLAGS
-
 TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
 
 
@@ -31,14 +29,14 @@ def runner(argv=None):
   if argv:
     FLAGS(argv, known_only=True)
 
-  data_pipe = AmazonBooks2014(FLAGS.max_seq_length)
+  prebatch_size = 5
+  data_pipe = AmazonBooks2014(FLAGS.max_seq_length, prebatch_size=prebatch_size)
 
   # create data pipline of train & test dataset
   # since each tfrecord file must include all of the features, it is enough to read first chunk for each split.
   # train_files = [dataset_dir / file for file in feature_spec.source_spec[TRAIN_MAPPING][0][FILES_SELECTOR]]
-  prebatch_size = 5
-  train_dataset = data_pipe(FLAGS.train_data, batch_size=FLAGS.batch_size, prebatch_size=prebatch_size)
+  train_dataset = data_pipe(FLAGS.train_data, batch_size=FLAGS.batch_size)
 
   _performance_calculator = PerformanceCalculator(0, 1000)
 
diff --git a/deepray/datasets/avazu/avazu.py b/deepray/datasets/avazu/avazu.py
index 74a5b130..b5d80249 100644
--- a/deepray/datasets/avazu/avazu.py
+++ b/deepray/datasets/avazu/avazu.py
@@ -23,9 +23,7 @@
 import tensorflow as tf
 from absl import flags
 
-from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine
-
-FLAGS = flags.FLAGS
+from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
 FLAGS([
@@ -41,7 +39,7 @@
 DEFAULT_VALUE = {"int64": 0, "float32": 0.0, "bytes": ""}
 
 
-class Avazu(ParquetPipeLine):
+class Avazu(ParquetPipeline):
 
   def parse(self, record):
     for name in self.feature_map[(self.feature_map['length'] == 1)]["name"].values:
diff --git a/deepray/datasets/avazu/avazu_test.py b/deepray/datasets/avazu/avazu_test.py
index 4738e832..3dd43727 100644
--- a/deepray/datasets/avazu/avazu_test.py
+++ b/deepray/datasets/avazu/avazu_test.py
@@ -11,8 +11,6 @@
 from deepray.datasets.avazu.avazu import Avazu
 from deepray.utils.benchmark import PerformanceCalculator
 
-FLAGS = flags.FLAGS
-
 TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
 
 
diff --git a/deepray/datasets/cifar/cifar.py b/deepray/datasets/cifar/cifar.py
index 4896c1dc..f825e63f 100644
--- a/deepray/datasets/cifar/cifar.py
+++ b/deepray/datasets/cifar/cifar.py
@@ -22,18 +22,17 @@
 import numpy as np
 import tensorflow as tf
 from absl import flags
-from keras.utils.data_utils import get_file
+from keras.src.utils.data_utils import get_file
 from tensorflow import keras
 
-from deepray.datasets.datapipeline 
import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class CIFAR(DataPipeLine): +class CIFAR(DataPipeline): def load_batch(self, fpath, label_key="labels"): """Internal utility for parsing CIFAR data. @@ -123,7 +122,7 @@ def __init__(self, **kwargs): ), ) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: num_train_samples = 50000 @@ -150,7 +149,7 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat y = keras.utils.to_categorical(y, num_classes) dataset = tf.data.Dataset.from_tensor_slices((x / 255.0, y)) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset @@ -202,7 +201,7 @@ def __init__(self, label_mode="fine", **kwargs): f"Received: label_mode={label_mode}.") dirname = "cifar-100-python" - origin = "http://minio1.arsenal.kanzhun-inc.com/datasets/cifar100/cifar-100-python.tar.gz" #"https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + origin = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" self.path = get_file( dirname, origin=origin, @@ -213,7 +212,7 @@ def __init__(self, label_mode="fine", **kwargs): ) self.label_mode = label_mode - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: fpath = os.path.join(self.path, "train") @@ -230,5 +229,5 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat y = keras.utils.to_categorical(y, num_classes) dataset = tf.data.Dataset.from_tensor_slices((x / 255.0, y)) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset diff --git a/deepray/datasets/cifar/cifar_test.py b/deepray/datasets/cifar/cifar_test.py index 1de32b11..de256065 100644 --- a/deepray/datasets/cifar/cifar_test.py +++ b/deepray/datasets/cifar/cifar_test.py @@ -9,8 +9,6 @@ from .cifar import CIFAR100, CIFAR10 -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/creditcardfraud/creditcardfraud.py b/deepray/datasets/creditcardfraud/creditcardfraud.py index 19e429de..a2c83c97 100644 --- a/deepray/datasets/creditcardfraud/creditcardfraud.py +++ b/deepray/datasets/creditcardfraud/creditcardfraud.py @@ -12,29 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Fashion-MNIST dataset.""" +"""Credit Card Fraud dataset.""" -import gzip -import os import sys + import numpy as np +import pandas as pd import tensorflow as tf from absl import flags -import pandas as pd -from keras.utils.data_utils import get_file from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=182280", ]) -class CreditCardFraud(DataPipeLine): +class CreditCardFraud(DataPipeline): def __init__(self, url='https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'): super().__init__() @@ -86,19 +83,12 @@ def __len__(self): pass def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): if is_training: ds = tf.data.Dataset.from_tensor_slices((self.train_features, self.train_labels)) else: ds = tf.data.Dataset.from_tensor_slices((self.val_features, self.val_labels)) - ds = ds.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + ds = ds.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return ds diff --git a/deepray/datasets/creditcardfraud/creditcardfraud_test.py b/deepray/datasets/creditcardfraud/creditcardfraud_test.py index 94ff2a7f..ea7c6c91 100644 --- a/deepray/datasets/creditcardfraud/creditcardfraud_test.py +++ b/deepray/datasets/creditcardfraud/creditcardfraud_test.py @@ -9,8 +9,6 @@ from .creditcardfraud import CreditCardFraud -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/criteo/docker/Dockerfile_preprocessing b/deepray/datasets/criteo/Dockerfile_preprocessing similarity index 100% rename from deepray/datasets/criteo/docker/Dockerfile_preprocessing rename to deepray/datasets/criteo/Dockerfile_preprocessing diff --git a/deepray/datasets/criteo/README.md b/deepray/datasets/criteo/README.md deleted file mode 100644 index 922715d8..00000000 --- a/deepray/datasets/criteo/README.md +++ /dev/null @@ -1,282 +0,0 @@ -# Criteo dataset processing - -This repository provides a script and recipe to process Criteo Terabyte Dataset. - - -## Quick Start Guide - -To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using -the default parameters of DLRM on the Criteo Terabyte dataset. For the specifics concerning training and inference, -see the [Advanced](#advanced) section. - -1. Clone the repository. -``` -git clone xxx -cd DeePray/deepray/datasets/criteo -``` - -2. Download the dataset. - -You can download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/. -When you have successfully downloaded it and unpacked it, set the `CRITEO_DATASET_PARENT_DIRECTORY` to its parent directory: -``` -CRITEO_DATASET_PARENT_DIRECTORY=/raid/criteo -``` -We recommend to choose the fastest possible file system, otherwise it may lead to an IO bottleneck. - -3. Build DLRM Docker containers -```bash -docker build -t criteo_preprocessing -f Dockerfile_preprocessing . --build-arg DGX_VERSION=[DGX-2|DGX-A100] -``` - -3. 
Start an interactive session in the NGC container to run preprocessing. -The DLRM PyTorch container can be launched with: -```bash -docker run --runtime=nvidia -it --rm --ipc=host -v ${CRITEO_DATASET_PARENT_DIRECTORY}:/data/dlrm criteo_preprocessing bash -``` - -4. Preprocess the dataset. - -Here are a few examples of different preprocessing commands. Out of the box, we support preprocessing on DGX-2 and DGX A100 systems. For the details on how those scripts work and detailed description of dataset types (small FL=15, large FL=3, xlarge FL=2), system requirements, setup instructions for different systems and all the parameters consult the [preprocessing section](#preprocessing). -For an explanation of the `FL` parameter, see the [Dataset Guidelines](#dataset-guidelines) and [Preprocessing](#preprocessing) sections. - -Depending on dataset type (small FL=15, large FL=3, xlarge FL=2) run one of following command: - -4.1. Preprocess to small dataset (FL=15) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 15 GPU Spark -``` - -4.2. Preprocess to large dataset (FL=3) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 3 GPU Spark -``` - -4.3. Preprocess to xlarge dataset (FL=2) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 2 GPU Spark -``` - - -## Advanced - -The following sections provide greater details of the dataset. - - -### Getting the data - -This example uses the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/). -The first 23 days are used as the training set. The last day is split in half. The first part, referred to as "test", is used for validating training results. The second one, referred to as "validation", is unused. - - -#### Dataset guidelines - -The preprocessing steps applied to the raw data include: -- Replacing the missing values with `0` -- Replacing the categorical values that exist fewer than `FL` times with a special value (FL value is called a frequency threshold or a frequency limit) -- Converting the hash values to consecutive integers -- Adding 3 to all the numerical features so that all of them are greater or equal to 1 -- Taking a natural logarithm of all numerical features - - -#### BYO dataset - -This implementation supports using other datasets thanks to BYO dataset functionality. -The BYO dataset functionality allows users to plug in their dataset in a common fashion for all Recommender models -that support this functionality. Using BYO dataset functionality, the user does not have to modify the source code of -the model thanks to the Feature Specification file. For general information on how BYO dataset works, refer to the -[BYO dataset overview section](#byo-dataset-functionality-overview). - -There are three ways to plug in user's dataset: -
-1. Provide an unprocessed dataset in a format matching the one used by Criteo 1TB, then use Criteo 1TB's preprocessing. Feature Specification file is then generated automatically. -The required format of the user's dataset is: - -The data should be split into text files. Each line of those text files should contain a single training example. -An example should consist of multiple fields separated by tabulators: - -* The first field is the label – 1 for a positive example and 0 for negative. -* The next N tokens should contain the numerical features separated by tabs. -* The next M tokens should contain the hashed categorical features separated by tabs. - -The correct dataset files together with the Feature Specification yaml file will be generated automatically by preprocessing script. - -For an example of using this process, refer to the [Quick Start Guide](#quick-start-guide) - -
-2. Provide a CSV containing preprocessed data and a simplified Feature Specification yaml file, then transcode the data with `transcode.py` script -This option should be used if the user has their own CSV file with a preprocessed dataset they want to train on. - -The required format of the user's dataset is: -* CSV files containing the data, already split into train and test sets. -* Feature Specification yaml file describing the layout of the CSV data - -For an example of a feature specification file, refer to the `tests/transcoding` folder. - -The CSV containing the data: -* should be already split into train and test -* should contain no header -* should contain one column per feature, in the order specified by the list of features for that chunk - in the source_spec section of the feature specification file -* categorical features should be non-negative integers in the range [0,cardinality-1] if cardinality is specified - -The Feature Specification yaml file: -* needs to describe the layout of data in CSV files -* may contain information about cardinalities. However, if set to `auto`, they will be inferred from the data by the transcoding script. - -Refer to `tests/transcoding/small_csv.yaml` for an example of the yaml Feature Specification. - -The following example shows how to use this way of plugging user's dataset: - -Prepare your data and save the path: -```bash -DATASET_PARENT_DIRECTORY=/raid/dlrm -``` - -Build the DLRM image with: -```bash -docker build -t nvidia_dlrm_pyt . -``` -Launch the container with: -```bash -docker run --runtime=nvidia -it --rm --ipc=host -v ${DATASET_PARENT_DIRECTORY}:/data nvidia_dlrm_preprocessing bash -``` - -If you are just testing the process, you can create synthetic csv data: -```bash -python -m dlrm.scripts.gen_csv --feature_spec_in tests/transcoding/small_csv.yaml -``` - -Convert the data: -```bash -mkdir /data/conversion_output -python -m dlrm.scripts.transcode --input /data --output /data/converted -``` -You may need to tune the --chunk_size parameter. Higher values speed up the conversion but require more RAM. - -This will convert the data from `/data` and save the output in `/data/converted`. -A feature specification file describing the new data will be automatically generated. - -To run the training on 1 GPU: -```bash -python -m dlrm.scripts.main --mode train --dataset /data/converted --amp --cuda_graphs -``` - -- multi-GPU for DGX A100: -```bash -python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \ - bash -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.main \ - --dataset /data/converted --seed 0 --epochs 1 --amp --cuda_graphs' -``` - -- multi-GPU for DGX-1 and DGX-2: -```bash -python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \ - bash -c './bind.sh --cpu=exclusive -- python -m dlrm.scripts.main \ - --dataset /data/converted --seed 0 --epochs 1 --amp --cuda_graphs' -``` -
-
-3. Provide a fully preprocessed dataset, saved in split binary files, and a Feature Specification yaml file -This is the option to choose if you want full control over preprocessing and/or want to preprocess data directly to the target format. - -Your final output will need to contain a Feature Specification yaml describing data and file layout. -For an example feature specification file, refer to `tests/feature_specs/criteo_f15.yaml` - -For details, refer to the [BYO dataset overview section](#byo-dataset-functionality-overview). -
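For orientation, the following sketch shows one way such split binary files could be read back with NumPy. It mirrors the dtype conventions used elsewhere in this repository (float16 numerical features, bool labels, and the smallest integer type that fits a categorical feature's cardinality, as in `get_categorical_feature_type` in `preproc/data/defaults.py`); the file names and the cardinality value are examples only, not a guaranteed layout.

```python
# Illustrative sketch of reading one batch back from a split binary dataset with NumPy.
# File names and the cardinality are examples; dtype rules follow the constraints
# described in this document (float16 numerical features, bool labels).
import numpy as np

batch_size = 4
num_numerical = 13           # example value
cat_cardinality = 7_912_888  # example cardinality of one categorical feature

# smallest integer type that can hold the cardinality
cat_dtype = next(t for t in (np.int8, np.int16, np.int32) if cat_cardinality < np.iinfo(t).max)

labels = np.fromfile("train/label.bin", dtype=np.bool_, count=batch_size)
numerical = np.fromfile("train/numerical.bin", dtype=np.float16, count=batch_size * num_numerical)
numerical = numerical.reshape(batch_size, num_numerical)
cat_0 = np.fromfile("train/cat_0.bin", dtype=cat_dtype, count=batch_size)
```

Storing each categorical feature in its own file with the smallest integer type that fits its cardinality keeps the per-feature files compact, which is why the cardinality drives the dtype choice here.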
- - - -##### Channel definitions and requirements - -This model defines three channels: - -- categorical, accepting an arbitrary number of features -- numerical, accepting an arbitrary number of features -- label, accepting a single feature - - -The training script expects two mappings: - -- train -- test - -For performance reasons: -* The only supported dataset type is split binary -* Splitting chunks into multiple files is not supported. -* Each categorical feature has to be provided in a separate chunk -* All numerical features have to be provided in a single chunk -* All numerical features have to appear in the same order in channel_spec and source_spec -* Only integer types are supported for categorical features -* Only float16 is supported for numerical features - -##### BYO dataset constraints for the model - -There are the following constraints of BYO dataset functionality for this model: -1. The performance of the model depends on the dataset size. Generally, the model should scale better for datasets containing more data points. For a smaller dataset, you might experience slower performance than the one reported for Criteo -2. Using other datasets might require tuning some hyperparameters (for example, learning rate, beta1 and beta2) to reach desired accuracy. -3. The optimized cuda interaction kernels for FP16 and TF32 assume that the number of categorical variables is smaller than WARP_SIZE=32 and embedding size is <=128 -#### Preprocessing - -The preprocessing scripts provided in this repository support running both on CPU and GPU using [NVtabular](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) (GPU only) and [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/). - -Please note that the preprocessing will require about 4TB of disk storage. - - -The syntax for the preprocessing script is as follows: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh -``` - -For the Criteo Terabyte dataset, we recommend a frequency threshold of `FL=3`(when using A100 40GB or V100 32 GB) or `FL=2`(when using A100 80GB) if you intend to run the hybrid-parallel mode -on multiple GPUs. If you want to make the model fit into a single NVIDIA Tesla V100-32GB, you can set `FL=15`. - -The first argument means the frequency threshold to apply to the categorical variables. For a frequency threshold `FL`, the categorical values that occur less -often than `FL` will be replaced with one special value for each category. Thus, a larger value of `FL` will require smaller embedding tables -and will substantially reduce the overall size of the model. - -The second argument is the hardware to use (either GPU or CPU). - -The third arguments is a framework to use (either NVTabular or Spark). In case of choosing a CPU preprocessing this argument is omitted as it only Apache Spark is supported on CPU. 
- -The preprocessing scripts make use of the following environment variables to configure the data directory paths: -- `download_dir` – this directory should contain the original Criteo Terabyte CSV files -- `spark_output_path` – directory to which the parquet data will be written -- `conversion_intermediate_dir` – directory used for storing intermediate data used to convert from parquet to train-ready format -- `final_output_dir` – directory to store the final results of the preprocessing which can then be used to train DLRM - -In the `final_output_dir` will be three subdirectories created: `train`, `test`, `validation`, and one json file – `model_size.json` – containing a maximal index of each category. -The `train` is the train dataset transformed from day_0 to day_22. -The `test` is the test dataset transformed from the prior half of day_23. -The `validation` is the dataset transformed from the latter half of day_23. - -The model is tested on 3 datasets resulting from Criteo dataset preprocessing: small (Freqency threshold = 15), large (Freqency threshold = 3) and xlarge (Freqency threshold = 2). Each dataset occupies approx 370GB of disk space. Table below presents information on the supercomputer and GPU count that are needed to train model on particular dataset. - -| Dataset | GPU VRAM consumption\* | Model checkpoint size\* | FL setting | DGX A100 40GB, 1GPU | DGX A100 40GB, 8GPU | DGX A100 80GB, 1GPU | DGX A100 80GB, 8GPU | DGX-1** or DGX-2, 1 GPU | DGX-1** or DGX-2, 8GPU | DGX-2, 16GPU | -| ------- | ---------------------- | ----------------------- | ---------- | -------------------- | -------------------- | -------------------- | -------------------- | ---------------------- | --------------------- | ------------ | -| small (FL=15) | 20.5 | 15.0 | 15 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | -| large (FL=3) | 132.3 | 81.9 | 3 | NA | Yes | NA | Yes | NA | Yes | Yes | -| xlarge (FL=2) | 198.8 | 141.3 | 2 | NA | NA | NA | Yes | NA | NA | NA | - -\*with default embedding dimension setting -\**DGX-1 V100 32GB - -##### NVTabular - -NVTabular preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values of `ALL_DS_MEM_FRAC`, `TRAIN_DS_MEM_FRAC`, `TEST_DS_MEM_FRAC`, `VALID_DS_MEM_FRAC` in `preproc/preproc_NVTabular.py`, so that they'll work on also on other hardware platforms such as DGX-1 or a custom one. - -##### Spark - -The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.1, which will start a standalone cluster of Spark. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`. - -Note that the Spark job requires about 3TB disk space used for data shuffling. - -Spark preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values in `preproc/DGX-2_config.sh` or `preproc/DGX-A100_config.sh` -so that they'll work on also on other hardware platforms such as DGX-1 or a custom one. 
diff --git a/deepray/datasets/criteo/criteo.py b/deepray/datasets/criteo/criteo.py index ee39a41a..17e20922 100644 --- a/deepray/datasets/criteo/criteo.py +++ b/deepray/datasets/criteo/criteo.py @@ -17,17 +17,19 @@ import sys from absl import flags +from tensorflow.python.distribute.distribute_lib import InputContext -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ - sys.argv[0], - "--num_train_examples=11932672", -]) +class Criteo(DataPipeline): -class Criteo(DataPipeLine): + def __init__(self, context: InputContext = None, **kwargs): + super().__init__(context, **kwargs) + flags.FLAGS([ + sys.argv[0], + "--num_train_examples=11932672", + ]) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): pass diff --git a/deepray/datasets/criteo/criteo_dataset.md b/deepray/datasets/criteo/criteo_dataset.md new file mode 100644 index 00000000..89dd97d8 --- /dev/null +++ b/deepray/datasets/criteo/criteo_dataset.md @@ -0,0 +1,190 @@ +## Quick Start Guide + +To prepare the Criteo 1TB dataset for training, follow these steps. + +1. Make sure you meet the prerequisites. + +You will need around 4TB of storage for storing the original Criteo 1TB dataset, the results of some +intermediate preprocessing steps and the final dataset. The final dataset itself will take about 400GB. + +We recommend using local storage, such as a fast SSD drive, to run the preprocessing. Using other types of storage +will negatively impact the preprocessing time. + + +2. Build the preprocessing docker image. +```bash +docker build -t preproc_docker_image -f Dockerfile_spark . --build-arg DGX_VERSION=[DGX-2|DGX-A100] +``` + +3. Download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/. + +When you have successfully downloaded the dataset, put it in the `/data/criteo_orig` directory in the container +(`$PWD/data/criteo_orig` in the host system). + +4. Start an interactive session in the NGC container to run preprocessing. +The DLRM TensorFlow container can be launched with: + +```bash +mkdir -p data +docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data preproc_docker_image bash +``` + +5. Unzip the data with: + +```bash +gunzip /data/criteo_orig/*.gz +``` + +6. Preprocess the data. + +Here are a few examples of different preprocessing commands. Out of the box, we support preprocessing on DGX-2 and DGX A100 systems. For the details on how those scripts work and detailed description of dataset types (small FL=15, large FL=3, xlarge FL=2), system requirements, setup instructions for different systems and all the parameters consult the [preprocessing section](#preprocessing). +For an explanation of the `FL` parameter, see the [Dataset Guidelines](#dataset-guidelines) and [Preprocessing](#preprocessing) sections. 
+
+Depending on the dataset type (small FL=15, large FL=3, xlarge FL=2), run one of the following commands:
+
+```bash
+export download_dir=/data/criteo_orig
+export final_output_dir=/data/preprocessed
+
+cd preproc
+
+# Preprocess to small dataset (FL=15) with Spark GPU:
+./prepare_dataset.sh 15 GPU Spark
+
+# Preprocess to large dataset (FL=3) with Spark GPU:
+./prepare_dataset.sh 3 GPU Spark
+
+# Preprocess to xlarge dataset (FL=2) with Spark GPU:
+./prepare_dataset.sh 2 GPU Spark
+
+# To run on Spark GPU with no frequency limit:
+./prepare_dataset.sh 0 GPU Spark
+```
+
+
+
+## Advanced
+
+### Dataset guidelines
+
+The first 23 days are used as the training set. The last day is split in half.
+The first part is used as a validation set and the second part is used as a hold-out test set.
+
+The preprocessing steps applied to the raw data include:
+- Replacing the missing values with `0`.
+- Replacing the categorical values that occur fewer than 15 times with a special value.
+- Converting the hash values to consecutive integers.
+- Adding 2 to all the numerical features so that all of them are greater than or equal to 1.
+- Taking the natural logarithm of all numerical features.
+
+
+### Preprocess with Spark
+
+The preprocessing scripts provided in this repository support running on both CPU and GPU, using [NVTabular](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) (GPU only) and [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/).
+
+Please note that the preprocessing will require about 4TB of disk storage.
+
+
+The syntax for the preprocessing script is as follows:
+```bash
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh
+```
+
+For the Criteo Terabyte dataset, we recommend a frequency threshold of `FL=3` (when using A100 40GB or V100 32GB) or `FL=2` (when using A100 80GB) if you intend to run the hybrid-parallel mode
+on multiple GPUs. If you want to make the model fit into a single NVIDIA Tesla V100-32GB, you can set `FL=15`.
+
+The first argument is the frequency threshold to apply to the categorical variables. For a frequency threshold `FL`, the categorical values that occur less
+often than `FL` will be replaced with one special value for each category. Thus, a larger value of `FL` will require smaller embedding tables
+and will substantially reduce the overall size of the model.
+
+The second argument is the hardware to use (either GPU or CPU).
+
+The third argument is the framework to use (either NVTabular or Spark). When preprocessing on CPU, this argument is omitted because only Apache Spark is supported on CPU.
+
+The preprocessing scripts make use of the following environment variables to configure the data directory paths:
+- `download_dir` – this directory should contain the original Criteo Terabyte CSV files
+- `spark_output_path` – directory to which the parquet data will be written
+- `conversion_intermediate_dir` – directory used for storing intermediate data used to convert from parquet to train-ready format
+- `final_output_dir` – directory to store the final results of the preprocessing which can then be used to train DLRM
+
+The preprocessing creates three subdirectories in `final_output_dir`: `train`, `test`, and `validation`, plus one JSON file, `model_size.json`, containing the maximal index of each category.
+The `train` directory holds the training data transformed from day_0 to day_22.
+The `test` directory holds the test data transformed from the prior half of day_23.
+The `validation` directory holds the validation data transformed from the latter half of day_23.
+
+The model is tested on 3 datasets resulting from Criteo dataset preprocessing: small (frequency threshold = 15), large (frequency threshold = 3) and xlarge (frequency threshold = 2). Each dataset occupies approximately 370GB of disk space. The table below lists the systems and GPU counts needed to train the model on each dataset.
+
+| Dataset | GPU VRAM consumption\* [GB] | Model checkpoint size\* [GB] | FL setting | DGX A100 40GB, 1GPU | DGX A100 40GB, 8GPU | DGX A100 80GB, 1GPU | DGX A100 80GB, 8GPU | DGX-1** or DGX-2, 1 GPU | DGX-1** or DGX-2, 8GPU | DGX-2, 16GPU |
+| ------- | ---------------------- | ----------------------- | ---------- | -------------------- | -------------------- | -------------------- | -------------------- | ---------------------- | --------------------- | ------------ |
+| small (FL=15) | 20.5 | 15.0 | 15 | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
+| large (FL=3) | 132.3 | 81.9 | 3 | NA | Yes | NA | Yes | NA | Yes | Yes |
+| xlarge (FL=2) | 198.8 | 141.3 | 2 | NA | NA | NA | Yes | NA | NA | NA |
+
+\*with the default embedding dimension setting
+\**DGX-1 V100 32GB
+
+#### NVTabular
+
+NVTabular preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values of `ALL_DS_MEM_FRAC`, `TRAIN_DS_MEM_FRAC`, `TEST_DS_MEM_FRAC`, and `VALID_DS_MEM_FRAC` in `preproc/preproc_NVTabular.py` so that it also works on other hardware platforms such as DGX-1 or a custom one.
+
+#### Spark
+
+The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.1, which runs as a standalone Spark cluster. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`.
+
+Note that the Spark job requires about 3TB of disk space for data shuffling.
+
+Spark preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values in `preproc/DGX-2_config.sh` or `preproc/DGX-A100_config.sh`
+so that they also work on other hardware platforms such as DGX-1 or a custom one.
+
+The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py` that, for example:
+- generate the dictionary
+- transform the train dataset
+- transform the test dataset
+- transform the validation dataset
+
+Change the variables in the `run-spark.sh` script according to your environment and configure the paths:
+```
+export SPARK_LOCAL_DIRS=/data/spark-tmp
+export INPUT_PATH=/data/criteo
+export OUTPUT_PATH=/data/output
+```
+Note that the Spark job requires about 3TB of disk space for data shuffling.
+
+Where:
+`SPARK_LOCAL_DIRS` is the path Spark uses to write shuffle data.
+`INPUT_PATH` is the path of the Criteo Terabyte Dataset, including uncompressed files like day_0, day_1…
+`OUTPUT_PATH` is where the script writes the output data. It will generate the following subdirectories: `models`, `train`, `test`, and `validation`.
+- The `model` directory is the dictionary folder.
+- The `train` directory holds the training data transformed from day_0 to day_22.
+- The `test` directory holds the test data transformed from the prior half of day_23.
+- The `validation` directory holds the validation data transformed from the latter half of day_23.
+
+Configure the resources that Spark will use:
+```
+export TOTAL_CORES=80
+export TOTAL_MEMORY=800
+```
+
+Where:
+`TOTAL_CORES` is the total number of CPU cores you want Spark to use.
+
+`TOTAL_MEMORY` is the total memory Spark will use.
+
+Configure the frequency limit:
+```
+USE_FREQUENCY_LIMIT=15
+```
+The frequency limit is used to filter out the categorical values that appear fewer than n times in the whole dataset and map them to 0. Set this variable to 1 to enable the limit. The default frequency limit in the script is 15. You can also change the limit by editing the line `OPTS="--frequency_limit 8"`.
+
diff --git a/deepray/datasets/criteo/criteo_test.py b/deepray/datasets/criteo/criteo_test.py index 8f2cb38e..bb66063b 100644 --- a/deepray/datasets/criteo/criteo_test.py +++ b/deepray/datasets/criteo/criteo_test.py @@ -20,9 +20,6 @@ stop_threshold=False, ) -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/criteo/criteo_tsv_reader.py b/deepray/datasets/criteo/criteo_tsv_reader.py index 0a511105..bbccdecc 100644 --- a/deepray/datasets/criteo/criteo_tsv_reader.py +++ b/deepray/datasets/criteo/criteo_tsv_reader.py @@ -21,13 +21,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_world_size, get_rank -FLAGS = flags.FLAGS - -class CriteoTsvReader(DataPipeLine): +class CriteoTsvReader(DataPipeline): """Input reader callable for pre-processed Criteo data.
Raw Criteo data is assumed to be preprocessed in the following way: @@ -49,7 +47,6 @@ def build_dataset( input_file_pattern, batch_size, is_training=True, - prebatch_size=0, epochs=1, shuffle=True, *args, @@ -76,7 +73,7 @@ def make_dataset(): indices = tf.data.Dataset.range(get_world_size()) dataset = indices.interleave( - map_func=make_dataset, cycle_length=FLAGS.cycle_length, num_parallel_calls=tf.data.experimental.AUTOTUNE + map_func=make_dataset, cycle_length=flags.FLAGS.cycle_length, num_parallel_calls=tf.data.experimental.AUTOTUNE ) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) @@ -93,7 +90,7 @@ def parser(self, example: tf.Tensor): fields = tf.io.decode_csv(example, record_defaults, field_delim='\t', na_value='-1') num_labels = 1 - label = tf.reshape(fields[0], [FLAGS.batch_size, 1]) + label = tf.reshape(fields[0], [flags.FLAGS.batch_size, 1]) features = {} num_dense = len(dense_defaults) diff --git a/deepray/datasets/criteo/criteo_tsv_reader_test.py b/deepray/datasets/criteo/criteo_tsv_reader_test.py index 3c987e07..6188e386 100644 --- a/deepray/datasets/criteo/criteo_tsv_reader_test.py +++ b/deepray/datasets/criteo/criteo_tsv_reader_test.py @@ -10,8 +10,6 @@ from deepray.datasets.criteo.criteo_tsv_reader import CriteoTsvReader from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - def runner(argv=None): dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/deepray/datasets/criteo/feature_map_small.csv b/deepray/datasets/criteo/feature_map_small.csv index 43296d20..45f9d00c 100644 --- a/deepray/datasets/criteo/feature_map_small.csv +++ b/deepray/datasets/criteo/feature_map_small.csv @@ -1,41 +1,41 @@ name,dtype,ftype,dim,length,voc_size -feature_0,int32,Label,1,1, -feature_1,float64,Numerical,1,1, -feature_2,float64,Numerical,1,1, -feature_3,float64,Numerical,1,1, -feature_4,float64,Numerical,1,1, -feature_5,float64,Numerical,1,1, -feature_6,float64,Numerical,1,1, -feature_7,float64,Numerical,1,1, -feature_8,float64,Numerical,1,1, -feature_9,float64,Numerical,1,1, -feature_10,float64,Numerical,1,1, -feature_11,float64,Numerical,1,1, -feature_12,float64,Numerical,1,1, -feature_13,float64,Numerical,1,1, -feature_14,int32,Categorical,16,1,7912888 -feature_15,int32,Categorical,16,1,33822 -feature_16,int32,Categorical,16,1,17138 -feature_17,int32,Categorical,16,1,7338 -feature_18,int32,Categorical,16,1,20045 -feature_19,int32,Categorical,16,1,3 -feature_20,int32,Categorical,16,1,7104 -feature_21,int32,Categorical,16,1,1381 -feature_22,int32,Categorical,16,1,62 -feature_23,int32,Categorical,16,1,5554113 -feature_24,int32,Categorical,16,1,582468 -feature_25,int32,Categorical,16,1,245827 -feature_26,int32,Categorical,16,1,10 -feature_27,int32,Categorical,16,1,2208 -feature_28,int32,Categorical,16,1,10666 -feature_29,int32,Categorical,16,1,103 -feature_30,int32,Categorical,16,1,3 -feature_31,int32,Categorical,16,1,967 -feature_32,int32,Categorical,16,1,14 -feature_33,int32,Categorical,16,1,8165895 -feature_34,int32,Categorical,16,1,2675939 -feature_35,int32,Categorical,16,1,7156452 -feature_36,int32,Categorical,16,1,302515 -feature_37,int32,Categorical,16,1,12021 -feature_38,int32,Categorical,16,1,96 -feature_39,int32,Categorical,16,1,34 \ No newline at end of file +f_c0,int32,Label,1,1, +f_c1,float64,Numerical,1,1, +f_c2,float64,Numerical,1,1, +f_c3,float64,Numerical,1,1, +f_c4,float64,Numerical,1,1, +f_c5,float64,Numerical,1,1, +f_c6,float64,Numerical,1,1, +f_c7,float64,Numerical,1,1, +f_c8,float64,Numerical,1,1, 
+f_c9,float64,Numerical,1,1, +f_c10,float64,Numerical,1,1, +f_c11,float64,Numerical,1,1, +f_c12,float64,Numerical,1,1, +f_c13,float64,Numerical,1,1, +f_c14,int32,Categorical,16,1,7912888 +f_c15,int32,Categorical,16,1,33822 +f_c16,int32,Categorical,16,1,17138 +f_c17,int32,Categorical,16,1,7338 +f_c18,int32,Categorical,16,1,20045 +f_c19,int32,Categorical,16,1,3 +f_c20,int32,Categorical,16,1,7104 +f_c21,int32,Categorical,16,1,1381 +f_c22,int32,Categorical,16,1,62 +f_c23,int32,Categorical,16,1,5554113 +f_c24,int32,Categorical,16,1,582468 +f_c25,int32,Categorical,16,1,245827 +f_c26,int32,Categorical,16,1,10 +f_c27,int32,Categorical,16,1,2208 +f_c28,int32,Categorical,16,1,10666 +f_c29,int32,Categorical,16,1,103 +f_c30,int32,Categorical,16,1,3 +f_c31,int32,Categorical,16,1,967 +f_c32,int32,Categorical,16,1,14 +f_c33,int32,Categorical,16,1,8165895 +f_c34,int32,Categorical,16,1,2675939 +f_c35,int32,Categorical,16,1,7156452 +f_c36,int32,Categorical,16,1,302515 +f_c37,int32,Categorical,16,1,12021 +f_c38,int32,Categorical,16,1,96 +f_c39,int32,Categorical,16,1,34 \ No newline at end of file diff --git a/deepray/datasets/criteo/feature_map_xlarge.csv b/deepray/datasets/criteo/feature_map_xlarge.csv index 40ecd51d..b90ab0f3 100644 --- a/deepray/datasets/criteo/feature_map_xlarge.csv +++ b/deepray/datasets/criteo/feature_map_xlarge.csv @@ -1,41 +1,41 @@ name,dtype,ftype,dim,length,voc_size -_c0,int32,Label,1,1, -_c1,float64,Numerical,1,1, -_c2,float64,Numerical,1,1, -_c3,float64,Numerical,1,1, -_c4,float64,Numerical,1,1, -_c5,float64,Numerical,1,1, -_c6,float64,Numerical,1,1, -_c7,float64,Numerical,1,1, -_c8,float64,Numerical,1,1, -_c9,float64,Numerical,1,1, -_c10,float64,Numerical,1,1, -_c11,float64,Numerical,1,1, -_c12,float64,Numerical,1,1, -_c13,float64,Numerical,1,1, -_c14,int32,Categorical,1,1,227605431 -_c15,int32,Categorical,1,1,39060 -_c16,int32,Categorical,1,1,17295 -_c17,int32,Categorical,1,1,7424 -_c18,int32,Categorical,1,1,20265 -_c19,int32,Categorical,1,1,3 -_c20,int32,Categorical,1,1,7122 -_c21,int32,Categorical,1,1,1543 -_c22,int32,Categorical,1,1,63 -_c23,int32,Categorical,1,1,130229466 -_c24,int32,Categorical,1,1,3067955 -_c25,int32,Categorical,1,1,405282 -_c26,int32,Categorical,1,1,10 -_c27,int32,Categorical,1,1,2208 -_c28,int32,Categorical,1,1,11938 -_c29,int32,Categorical,1,1,154 -_c30,int32,Categorical,1,1,3 -_c31,int32,Categorical,1,1,976 -_c32,int32,Categorical,1,1,14 -_c33,int32,Categorical,1,1,292775613 -_c34,int32,Categorical,1,1,40790947 -_c35,int32,Categorical,1,1,187188509 -_c36,int32,Categorical,1,1,590151 -_c37,int32,Categorical,1,1,12973 -_c38,int32,Categorical,1,1,108 -_c39,int32,Categorical,1,1,36 \ No newline at end of file +f_c0,int32,Label,1,1, +f_c1,float64,Numerical,1,1, +f_c2,float64,Numerical,1,1, +f_c3,float64,Numerical,1,1, +f_c4,float64,Numerical,1,1, +f_c5,float64,Numerical,1,1, +f_c6,float64,Numerical,1,1, +f_c7,float64,Numerical,1,1, +f_c8,float64,Numerical,1,1, +f_c9,float64,Numerical,1,1, +f_c10,float64,Numerical,1,1, +f_c11,float64,Numerical,1,1, +f_c12,float64,Numerical,1,1, +f_c13,float64,Numerical,1,1, +f_c14,int32,Categorical,1,1,227605431 +f_c15,int32,Categorical,1,1,39060 +f_c16,int32,Categorical,1,1,17295 +f_c17,int32,Categorical,1,1,7424 +f_c18,int32,Categorical,1,1,20265 +f_c19,int32,Categorical,1,1,3 +f_c20,int32,Categorical,1,1,7122 +f_c21,int32,Categorical,1,1,1543 +f_c22,int32,Categorical,1,1,63 +f_c23,int32,Categorical,1,1,130229466 +f_c24,int32,Categorical,1,1,3067955 +f_c25,int32,Categorical,1,1,405282 +f_c26,int32,Categorical,1,1,10 
+f_c27,int32,Categorical,1,1,2208 +f_c28,int32,Categorical,1,1,11938 +f_c29,int32,Categorical,1,1,154 +f_c30,int32,Categorical,1,1,3 +f_c31,int32,Categorical,1,1,976 +f_c32,int32,Categorical,1,1,14 +f_c33,int32,Categorical,1,1,292775613 +f_c34,int32,Categorical,1,1,40790947 +f_c35,int32,Categorical,1,1,187188509 +f_c36,int32,Categorical,1,1,590151 +f_c37,int32,Categorical,1,1,12973 +f_c38,int32,Categorical,1,1,108 +f_c39,int32,Categorical,1,1,36 \ No newline at end of file diff --git a/deepray/datasets/criteo/preproc/data/__init__.py b/deepray/datasets/criteo/preproc/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/criteo/preproc/data/defaults.py b/deepray/datasets/criteo/preproc/data/defaults.py new file mode 100644 index 00000000..b4e12767 --- /dev/null +++ b/deepray/datasets/criteo/preproc/data/defaults.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +CATEGORICAL_CHANNEL = "categorical" +NUMERICAL_CHANNEL = "numerical" +LABEL_CHANNEL = "label" + +SPLIT_BINARY = "split_binary" + +TRAIN_MAPPING = "train" +TEST_MAPPING = "test" + +TYPE_SELECTOR = "type" +FEATURES_SELECTOR = "features" +FILES_SELECTOR = "files" + +DTYPE_SELECTOR = "dtype" +CARDINALITY_SELECTOR = "cardinality" + + +def get_categorical_feature_type(size: int): + """This function works both when max value and cardinality is passed. + Consistency by the user is required""" + types = (np.int8, np.int16, np.int32) + + for numpy_type in types: + if size < np.iinfo(numpy_type).max: + return numpy_type + + raise RuntimeError(f"Categorical feature of size {size} is too big for defined types") diff --git a/deepray/datasets/criteo/preproc/data/feature_spec.py b/deepray/datasets/criteo/preproc/data/feature_spec.py new file mode 100644 index 00000000..f40a43bb --- /dev/null +++ b/deepray/datasets/criteo/preproc/data/feature_spec.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import yaml +import os +from typing import Dict +from typing import List +import numpy as np +from defaults import CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL, \ + TRAIN_MAPPING, TEST_MAPPING, \ + TYPE_SELECTOR, FEATURES_SELECTOR, FILES_SELECTOR, CARDINALITY_SELECTOR, DTYPE_SELECTOR, \ + SPLIT_BINARY, \ + get_categorical_feature_type +""" For performance reasons, numerical features are required to appear in the same order + in both source_spec and channel_spec. + For more detailed requirements, see the check_feature_spec method""" + + +class FeatureSpec: + + def __init__(self, feature_spec=None, source_spec=None, channel_spec=None, metadata=None, base_directory=None): + self.feature_spec: Dict = feature_spec if feature_spec is not None else {} + self.source_spec: Dict = source_spec if source_spec is not None else {} + self.channel_spec: Dict = channel_spec if channel_spec is not None else {} + self.metadata: Dict = metadata if metadata is not None else {} + self.base_directory: str = base_directory + + @classmethod + def from_yaml(cls, path): + with open(path, 'r') as feature_spec_file: + base_directory = os.path.dirname(path) + feature_spec = yaml.safe_load(feature_spec_file) + return cls.from_dict(feature_spec, base_directory=base_directory) + + @classmethod + def from_dict(cls, source_dict, base_directory): + return cls(base_directory=base_directory, **source_dict) + + def to_dict(self) -> Dict: + attributes_to_dump = ['feature_spec', 'source_spec', 'channel_spec', 'metadata'] + return {attr: self.__dict__[attr] for attr in attributes_to_dump} + + def to_string(self): + return yaml.dump(self.to_dict()) + + def to_yaml(self, output_path=None): + if not output_path: + output_path = self.base_directory + '/feature_spec.yaml' + with open(output_path, 'w') as output_file: + print(yaml.dump(self.to_dict()), file=output_file) + + def get_number_of_numerical_features(self) -> int: + numerical_features = self.channel_spec[NUMERICAL_CHANNEL] + return len(numerical_features) + + def cat_positions_to_names(self, positions: List[int]): + # Ordering needs to correspond to the one in get_categorical_sizes() + feature_names = self.get_categorical_feature_names() + return [feature_names[i] for i in positions] + + def get_categorical_feature_names(self): + """ Provides the categorical feature names. 
The returned order should be maintained.""" + return self.channel_spec[CATEGORICAL_CHANNEL] + + def get_categorical_sizes(self) -> List[int]: + """For a given feature spec, this function is expected to return the sizes in the order corresponding to the + order in the channel_spec section """ + categorical_features = self.get_categorical_feature_names() + cardinalities = [self.feature_spec[feature_name][CARDINALITY_SELECTOR] for feature_name in categorical_features] + + return cardinalities + + def check_feature_spec(self): + # TODO check if cardinality fits in dtype, check if base directory is set + # TODO split into two checking general and model specific requirements + # check that mappings are the ones expected + mapping_name_list = list(self.source_spec.keys()) + assert sorted(mapping_name_list) == sorted([TEST_MAPPING, TRAIN_MAPPING]) + + # check that channels are the ones expected + channel_name_list = list(self.channel_spec.keys()) + assert sorted(channel_name_list) == sorted([CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL]) + + categorical_features_list = self.channel_spec[CATEGORICAL_CHANNEL] + numerical_features_list = self.channel_spec[NUMERICAL_CHANNEL] + label_features_list = self.channel_spec[LABEL_CHANNEL] + set_of_categorical_features = set(categorical_features_list) + set_of_numerical_features = set(numerical_features_list) + + # check that exactly one label feature is selected + assert len(label_features_list) == 1 + label_feature_name = label_features_list[0] + + # check that lists in channel spec contain unique names + assert sorted(list(set_of_categorical_features)) == sorted(categorical_features_list) + assert sorted(list(set_of_numerical_features)) == sorted(numerical_features_list) + + # check that all features used in channel spec are exactly ones defined in feature_spec + feature_spec_features = list(self.feature_spec.keys()) + channel_spec_features = list( + set.union(set_of_categorical_features, set_of_numerical_features, {label_feature_name}) + ) + assert sorted(feature_spec_features) == sorted(channel_spec_features) + + # check that correct dtypes are provided for all features + for feature_dict in self.feature_spec.values(): + assert DTYPE_SELECTOR in feature_dict + try: + np.dtype(feature_dict[DTYPE_SELECTOR]) + except TypeError: + assert False, "Type not understood by numpy" + + # check that categorical features have cardinality provided + for feature_name, feature_dict in self.feature_spec.items(): + if feature_name in set_of_categorical_features: + assert CARDINALITY_SELECTOR in feature_dict + assert isinstance(feature_dict[CARDINALITY_SELECTOR], int) + + for mapping_name in [TRAIN_MAPPING, TEST_MAPPING]: + + mapping = self.source_spec[mapping_name] + mapping_features = set() + for chunk in mapping: + # check that chunk has the correct type + assert chunk[TYPE_SELECTOR] == SPLIT_BINARY + + contained_features = chunk[FEATURES_SELECTOR] + containing_files = chunk[FILES_SELECTOR] + + # check that features are unique in mapping + for feature in contained_features: + assert feature not in mapping_features + mapping_features.add(feature) + + # check that chunk has at least one feature + assert len(contained_features) >= 1 + + # check that chunk has exactly one file + assert len(containing_files) == 1 + + first_feature = contained_features[0] + + if first_feature in set_of_categorical_features: + # check that each categorical feature is in a different file + assert len(contained_features) == 1 + + elif first_feature in set_of_numerical_features: + # check
that numerical features are all in one chunk + assert sorted(contained_features) == sorted(numerical_features_list) + + # check that ordering is exactly same as in channel spec - required for performance + assert contained_features == numerical_features_list + + # check numerical dtype + for feature in contained_features: + assert np.dtype(self.feature_spec[feature][DTYPE_SELECTOR]) == np.float16 + + elif first_feature == label_feature_name: + # check that label feature is in a separate file + assert len(contained_features) == 1 + + # check label dtype + assert np.dtype(self.feature_spec[first_feature][DTYPE_SELECTOR]) == bool + + else: + assert False, "Feature of unknown type" + + # check that all features appeared in mapping + assert sorted(mapping_features) == sorted(feature_spec_features) + + @staticmethod + def get_default_feature_spec(number_of_numerical_features, categorical_feature_cardinalities): + numerical_feature_fstring = "num_{}" + categorical_feature_fstring = "cat_{}.bin" + label_feature_name = "label" + + numerical_file_name = "numerical.bin" + categorical_file_fstring = "{}" # TODO remove .bin from feature name, add to file name + label_file_name = "label.bin" + + number_of_categorical_features = len(categorical_feature_cardinalities) + numerical_feature_names = [numerical_feature_fstring.format(i) for i in range(number_of_numerical_features)] + categorical_feature_names = [categorical_feature_fstring.format(i) for i in range(number_of_categorical_features)] + cat_feature_types = [get_categorical_feature_type(int(cat_size)) for cat_size in categorical_feature_cardinalities] + + feature_dict = { + f_name: { + DTYPE_SELECTOR: str(np.dtype(f_type)), + CARDINALITY_SELECTOR: f_size + } for f_name, f_type, f_size in + zip(categorical_feature_names, cat_feature_types, categorical_feature_cardinalities) + } + for f_name in numerical_feature_names: + feature_dict[f_name] = {DTYPE_SELECTOR: str(np.dtype(np.float16))} + feature_dict[label_feature_name] = {DTYPE_SELECTOR: str(np.dtype(bool))} + + channel_spec = { + CATEGORICAL_CHANNEL: categorical_feature_names, + NUMERICAL_CHANNEL: numerical_feature_names, + LABEL_CHANNEL: [label_feature_name] + } + source_spec = {} + + for filename in (TRAIN_MAPPING, TEST_MAPPING): + source_spec[filename] = [] + dst_folder = filename + + numerical_file_path = os.path.join(dst_folder, numerical_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: numerical_feature_names, + FILES_SELECTOR: [numerical_file_path] + } + ) + + label_file_path = os.path.join(dst_folder, label_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: [label_feature_name], + FILES_SELECTOR: [label_file_path] + } + ) + + for feature_name in categorical_feature_names: + categorical_file_name = categorical_file_fstring.format(feature_name) + categorical_file_path = os.path.join(dst_folder, categorical_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: [feature_name], + FILES_SELECTOR: [categorical_file_path] + } + ) + + return FeatureSpec(feature_spec=feature_dict, source_spec=source_spec, channel_spec=channel_spec, metadata={}) + + def get_mapping_paths(self, mapping_name: str): + label_feature_name = self.channel_spec[LABEL_CHANNEL][0] + set_of_categorical_features = set(self.channel_spec[CATEGORICAL_CHANNEL]) + set_of_numerical_features = set(self.channel_spec[NUMERICAL_CHANNEL]) + + label_path = None + numerical_path = None + 
categorical_paths = dict() + for chunk in self.source_spec[mapping_name]: + local_path = os.path.join(self.base_directory, chunk[FILES_SELECTOR][0]) + if chunk[FEATURES_SELECTOR][0] in set_of_numerical_features: + numerical_path = local_path + elif chunk[FEATURES_SELECTOR][0] in set_of_categorical_features: + local_feature = chunk[FEATURES_SELECTOR][0] + categorical_paths[local_feature] = local_path + elif chunk[FEATURES_SELECTOR][0] == label_feature_name: + label_path = local_path + + return label_path, numerical_path, categorical_paths diff --git a/deepray/datasets/criteo/preproc/parquet_to_binary.py b/deepray/datasets/criteo/preproc/parquet_to_binary.py index f824cee9..cf13b33b 100644 --- a/deepray/datasets/criteo/preproc/parquet_to_binary.py +++ b/deepray/datasets/criteo/preproc/parquet_to_binary.py @@ -23,10 +23,10 @@ def process_file(f, dst): - label = '_c0' - dense_columns = [f'_c{i}' for i in range(1, 14)] - categorical_columns = [f'_c{i}' for i in range(14, 40)] - all_columns_sorted = [f'_c{i}' for i in range(0, 40)] + label = 'f_c0' + dense_columns = [f'f_c{i}' for i in range(1, 14)] + categorical_columns = [f'f_c{i}' for i in range(14, 40)] + all_columns_sorted = [f'f_c{i}' for i in range(0, 40)] data = pd.read_parquet(f) data = data[all_columns_sorted] diff --git a/deepray/datasets/criteo/preproc/preproc_NVTabular.py b/deepray/datasets/criteo/preproc/preproc_NVTabular.py index b99b3be1..90b57faf 100644 --- a/deepray/datasets/criteo/preproc/preproc_NVTabular.py +++ b/deepray/datasets/criteo/preproc/preproc_NVTabular.py @@ -43,9 +43,9 @@ LambdaOp from cudf.io.parquet import ParquetWriter -CRITEO_CONTINUOUS_COLUMNS = [f'_c{x}' for x in range(1, 14)] -CRITEO_CATEGORICAL_COLUMNS = [f'_c{x}' for x in range(14, 40)] -CRITEO_CLICK_COLUMNS = ['_c0'] +CRITEO_CONTINUOUS_COLUMNS = [f'f_c{x}' for x in range(1, 14)] +CRITEO_CATEGORICAL_COLUMNS = [f'f_c{x}' for x in range(14, 40)] +CRITEO_CLICK_COLUMNS = ['f_c0'] COLUMNS = CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS + CRITEO_CLICK_COLUMNS CRITEO_TRAIN_DAYS = list(range(0, 23)) diff --git a/deepray/datasets/criteo/preproc/spark_data_utils.py b/deepray/datasets/criteo/preproc/spark_data_utils.py index f549f87b..ee9da510 100644 --- a/deepray/datasets/criteo/preproc/spark_data_utils.py +++ b/deepray/datasets/criteo/preproc/spark_data_utils.py @@ -33,7 +33,7 @@ def get_column_counts_with_frequency_limit(df, frequency_limit=None): - cols = ['_c%d' % i for i in CAT_COLS] + cols = ['f_c%d' % i for i in CAT_COLS] df = ( df.select(posexplode(array(*cols)) ).withColumnRenamed('pos', 'column_id').withColumnRenamed('col', @@ -182,7 +182,7 @@ def apply_models(df, models, broadcast_model=False, skew_broadcast_pct=1.0): # not make a difference. 
models = sorted(models, key=itemgetter(3), reverse=True) for i, model, original_rows, would_broadcast in models: - col_name = '_c%d' % i + col_name = 'f_c%d' % i if not (would_broadcast or broadcast_model): # The data is highly skewed so we need to offset that cutoff = int(original_rows * skew_broadcast_pct / 100.0) @@ -193,11 +193,11 @@ def apply_models(df, models, broadcast_model=False, skew_broadcast_pct=1.0): model = (model.drop('model_count').withColumnRenamed('data', col_name)) model = broadcast(model) if broadcast_model else model df = (df.join(model, col_name, how='left').drop(col_name).withColumnRenamed('id', col_name)) - return df.fillna(0, ['_c%d' % i for i in CAT_COLS]) + return df.fillna(0, ['f_c%d' % i for i in CAT_COLS]) def transform_log(df, transform_log=False): - cols = ['_c%d' % i for i in INT_COLS] + cols = ['f_c%d' % i for i in INT_COLS] if transform_log: for col_name in cols: df = df.withColumn(col_name, log(df[col_name] + 3)) @@ -226,9 +226,9 @@ def delete_data_source(spark, path): def load_raw(spark, folder, day_range): - label_fields = [StructField('_c%d' % LABEL_COL, IntegerType())] - int_fields = [StructField('_c%d' % i, IntegerType()) for i in INT_COLS] - str_fields = [StructField('_c%d' % i, StringType()) for i in CAT_COLS] + label_fields = [StructField('f_c%d' % LABEL_COL, IntegerType())] + int_fields = [StructField('f_c%d' % i, IntegerType()) for i in INT_COLS] + str_fields = [StructField('f_c%d' % i, StringType()) for i in CAT_COLS] schema = StructType(label_fields + int_fields + str_fields) paths = [os.path.join(folder, 'day_%d' % i) for i in day_range] @@ -423,7 +423,7 @@ def _main(): models = list(load_column_models(spark, args.model_folder, bool(args.model_size_file))) if args.model_size_file: save_model_size( - OrderedDict(('_c%d' % i, agg.size) for i, _, agg, _ in models), args.model_size_file, args.write_mode + OrderedDict(('f_c%d' % i, agg.size) for i, _, agg, _ in models), args.model_size_file, args.write_mode ) models = [(i, df, agg.sum, flag) for i, df, agg, flag in models] diff --git a/deepray/datasets/criteo/preproc/split_dataset.py b/deepray/datasets/criteo/preproc/split_dataset.py index 4dad640f..2e7a75df 100644 --- a/deepray/datasets/criteo/preproc/split_dataset.py +++ b/deepray/datasets/criteo/preproc/split_dataset.py @@ -25,8 +25,8 @@ import sys sys.path.append('/workspace/dlrm') -from dlrm.data.defaults import get_categorical_feature_type -from dlrm.data.feature_spec import FeatureSpec +from data.defaults import get_categorical_feature_type +from data.feature_spec import FeatureSpec def split_binary_file( @@ -71,7 +71,7 @@ def split_binary_file( numerical_f.write(numerical_features.astype(np.float16).tobytes()) label = batch_data[:, 0] - label_f.write(label.astype(np.bool).tobytes()) + label_f.write(label.astype(bool).tobytes()) cat_offset = num_numerical_features + 1 for cat_idx, cat_feature_type in enumerate(cat_feature_types): diff --git a/deepray/datasets/criteo/docker/requirements_preprocessing.txt b/deepray/datasets/criteo/requirements_preprocessing.txt similarity index 58% rename from deepray/datasets/criteo/docker/requirements_preprocessing.txt rename to deepray/datasets/criteo/requirements_preprocessing.txt index 58f5116b..6be9d65d 100644 --- a/deepray/datasets/criteo/docker/requirements_preprocessing.txt +++ b/deepray/datasets/criteo/requirements_preprocessing.txt @@ -1,4 +1,4 @@ numpy pandas -joblib +joblib==0.16 tqdm diff --git a/deepray/datasets/csv_pipeline.py b/deepray/datasets/csv_pipeline.py deleted file mode 100644 index 
af3016d7..00000000 --- a/deepray/datasets/csv_pipeline.py +++ /dev/null @@ -1,20 +0,0 @@ -import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine -from absl import flags - -FLAGS = flags.FLAGS - - -class CSVPipeLine(DataPipeLine): - - def build_dataset(self, csv_path): - dataset = tf.data.experimental.make_csv_dataset( - csv_path, - record_defaults=list(self.feature_map["dtype"]), - column_names=list(self.feature_map["name"]), - batch_size=FLAGS.batch_size, - label_name=FLAGS.label, - field_delim=",", - header=True, - ) - return dataset diff --git a/deepray/datasets/csv_pipeline/__init__.py b/deepray/datasets/csv_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/csv_pipeline/csv_pipeline.py b/deepray/datasets/csv_pipeline/csv_pipeline.py new file mode 100644 index 00000000..ac8dd6e0 --- /dev/null +++ b/deepray/datasets/csv_pipeline/csv_pipeline.py @@ -0,0 +1,18 @@ +import tensorflow as tf +from deepray.datasets.datapipeline import DataPipeline +from absl import flags + + +class CSVPipeline(DataPipeline): + + def build_dataset(self, batch_size, input_file_pattern, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + dataset = tf.data.experimental.make_csv_dataset( + input_file_pattern, + record_defaults=list(self.feature_map["dtype"]), + column_names=list(self.feature_map["name"]), + batch_size=batch_size, + label_name=flags.FLAGS.label, + field_delim=",", + header=True, + ) + return dataset diff --git a/deepray/datasets/datapipeline.py b/deepray/datasets/datapipeline.py index 1c0dbf14..23dcc057 100644 --- a/deepray/datasets/datapipeline.py +++ b/deepray/datasets/datapipeline.py @@ -4,15 +4,13 @@ # @license : Copyright(C), import abc -import multiprocessing import os import urllib.request from enum import Enum import pandas as pd import tensorflow as tf -from absl import flags -from absl import logging +from absl import flags, logging import deepray from deepray.utils.data.feature_map import FeatureMap @@ -24,33 +22,29 @@ ROOT_PATH = os.path.dirname(deepray.__file__) -FLAGS = flags.FLAGS -flags.DEFINE_integer("parallel_parse", multiprocessing.cpu_count(), "Number of parallel parsing") -flags.DEFINE_integer("shuffle_buffer", None, "Size of shuffle buffer") -flags.DEFINE_integer("prefetch_buffer", 16, "Size of prefetch buffer") -flags.DEFINE_integer("parallel_reads_per_file", None, "Number of parallel reads per file") -flags.DEFINE_integer("interleave_cycle", 16, "Number of interleaved inputs") -flags.DEFINE_integer("interleave_block", 2, "Number of interleaved block_length inputs") -flags.DEFINE_float("neg_sample_rate", 0.0, "") -flags.DEFINE_string("conf_file", os.getcwd() + "/conf/dp.yaml", "configuration in file.") - IS_TRAINING = Enum('is_training', ('Train', 'Valid', 'Test')) -class DataPipeLine(tf.keras.layers.Layer): +class DataPipeline(object): def __init__(self, context: tf.distribute.InputContext = None, **kwargs): - super().__init__(**kwargs) - self.use_horovod = FLAGS.use_horovod + # super().__init__(**kwargs) + self.built = False + self.use_horovod = flags.FLAGS.use_horovod self.context = context - self.feature_map = FeatureMap(feature_map=FLAGS.feature_map, black_list=FLAGS.black_list).feature_map - # self.conf = Foo(FLAGS.conf_file).conf + self.feature_map = FeatureMap().feature_map + # self.conf = Foo(flags.FLAGS.conf_file).conf self.url = None + self.prebatch_size = kwargs.get("prebatch_size", None) @abc.abstractmethod def __len__(self): pass + @abc.abstractmethod + def build(self): + raise 
NotImplementedError("build: not implemented!") + @classmethod def read_list_from_file(cls, filename): file_list = tf.io.gfile.glob(filename) @@ -70,27 +64,18 @@ def parser(self, record): @abc.abstractmethod def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - prebatch_size=0, - epochs=1, - shuffle=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): """ must be defined in subclass """ raise NotImplementedError("build_dataset: not implemented!") - def call(self, input_file_pattern=None, batch_size=None, is_training=True, prebatch_size=0, *args, **kwargs): + def __call__(self, batch_size=None, input_file_pattern=None, is_training=True, *args, **kwargs): """Gets a closure to create a dataset.""" - return self.build_dataset( - input_file_pattern=input_file_pattern, batch_size=self.context.get_per_replica_batch_size(batch_size) if self.context else batch_size, + input_file_pattern=input_file_pattern, is_training=is_training, epochs=1, *args, @@ -125,3 +110,9 @@ def _dataset_options(self, input_files): options.experimental_optimization.map_parallelization = True return options + + def train_test_split(self, arrays, test_size=0.33, shuffle=False): + from sklearn.model_selection import train_test_split + random_state = flags.FLAGS.random_seed if flags.FLAGS.random_seed else 1024 + X_train, X_test = train_test_split(arrays, test_size=test_size, shuffle=shuffle, random_state=random_state) + return X_train, X_test diff --git a/deepray/datasets/dataset_factory.py b/deepray/datasets/dataset_factory.py index f3fbb934..45b22763 100644 --- a/deepray/datasets/dataset_factory.py +++ b/deepray/datasets/dataset_factory.py @@ -1,8 +1,6 @@ from absl import logging, flags flags.DEFINE_string("data_source", "parquet_dataset", "parquet or tfrecord") - -FLAGS = flags.FLAGS """ Build model """ @@ -19,10 +17,10 @@ def load_dataset(): module_instance = ArsenalDatasetV3() elif module_class_name == "parquet_dataset": - from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine + from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline logging.info("Load parquet dataset") - module_instance = ParquetPipeLine() + module_instance = ParquetPipeline() """ abs_mod_dir_path = os.path.dirname(os.path.realpath(__file__)) logging.info(f"abs_mod_dir_path: {abs_mod_dir_path}") diff --git a/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py b/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py index 08234316..71539eed 100644 --- a/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py +++ b/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py @@ -29,12 +29,12 @@ def __init__(self, save_path): self.model_urls = { 'bert_base_uncased': ( - 'http://minio1.arsenal.kanzhun-inc.com/datasets/bert_models/google_pretrained_weights/uncased_L-12_H-768_A-12.tar.gz', + 'https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/uncased_L-12_H-768_A-12.tar.gz', 'uncased_L-12_H-768_A-12.tar.gz' ), 'bert_large_uncased': ( - 'http://minio1.arsenal.kanzhun-inc.com/datasets/bert_models/google_pretrained_weights/uncased_L-24_H-1024_A-16.tar.gz', + 'https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/uncased_L-24_H-1024_A-16.tar.gz', 'uncased_L-24_H-1024_A-16.tar.gz' ), # 'bert_base_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz', 'cased_L-12_H-768_A-12.tar.gz'), diff --git 
a/deepray/datasets/downloader/bertPrep.py b/deepray/datasets/downloader/bertPrep.py index 0f751cc1..de0cf4ca 100644 --- a/deepray/datasets/downloader/bertPrep.py +++ b/deepray/datasets/downloader/bertPrep.py @@ -16,15 +16,17 @@ import pprint import subprocess -from bookscorpus import BookscorpusTextFormatting +import bookscorpus.BookscorpusTextFormatting +import pubmed.PubMedTextFormatting +import wikicorpus.WikicorpusTextFormatting + import Downloader -from pubmed import PubMedTextFormatting import TextSharding -from wikicorpus import WikicorpusTextFormatting def main(args): - working_dir = "/workspaces/dataset/wikicorpus_en" # os.environ['BERT_PREP_WORKING_DIR'] + working_dir = os.environ['BERT_PREP_WORKING_DIR'] + print('Working Directory:', working_dir) print('Action:', args.action) print('Dataset Name:', args.dataset) @@ -37,7 +39,7 @@ def main(args): + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \ + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100)) directory_structure = { - 'download': working_dir + '', # Downloaded and decompressed + 'download': working_dir + '/download', # Downloaded and decompressed 'extracted': working_dir + '/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) 'formatted': working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same @@ -71,7 +73,7 @@ def main(args): if args.dataset == 'bookscorpus': books_path = directory_structure['download'] + '/bookscorpus' - # books_path = directory_structure['download'] + #books_path = directory_structure['download'] output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) books_formatter.merge() @@ -92,10 +94,9 @@ def main(args): wiki_formatter.merge() elif args.dataset == 'wikicorpus_zh': - assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is ' \ - 'added.' + assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.' 
if args.skip_wikiextractor == 0: - path_to_wikiextractor_in_container = 'WikiExtractor.py' + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure[ 'download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str( args.n_processes @@ -176,7 +177,7 @@ def main(args): last_process = None def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'): - bert_preprocessing_command = 'python /workspaces/Deepray2/deepray/datasets/downloader/create_pretraining_data.py' + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' bert_preprocessing_command += ' --input_file=' + directory_structure[ 'sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt' bert_preprocessing_command += ' --output_file=' + directory_structure[ diff --git a/deepray/datasets/downloader/create_datasets_from_start.sh b/deepray/datasets/downloader/create_datasets_from_start.sh index e16b7284..0677d45e 100755 --- a/deepray/datasets/downloader/create_datasets_from_start.sh +++ b/deepray/datasets/downloader/create_datasets_from_start.sh @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +set -e export BERT_PREP_WORKING_DIR=/workspaces/bert_tf2/data to_download=${1:-"all"} diff --git a/deepray/datasets/downloader/create_finetuning_data.py b/deepray/datasets/downloader/create_finetuning_data.py index 683f2f33..581e7083 100644 --- a/deepray/datasets/downloader/create_finetuning_data.py +++ b/deepray/datasets/downloader/create_finetuning_data.py @@ -31,8 +31,6 @@ from squad import squad_lib_sp import tokenization -FLAGS = flags.FLAGS - flags.DEFINE_enum( "fine_tuning_task_type", "classification", ["classification", "squad"], "The name of the BERT fine tuning task for which data " diff --git a/deepray/datasets/downloader/create_pretraining_data.py b/deepray/datasets/downloader/create_pretraining_data.py index 13040329..b922b241 100644 --- a/deepray/datasets/downloader/create_pretraining_data.py +++ b/deepray/datasets/downloader/create_pretraining_data.py @@ -24,8 +24,6 @@ import tokenization -FLAGS = flags.FLAGS - flags.DEFINE_string("input_file", None, "Input raw text file (or comma-separated list of files).") flags.DEFINE_string("output_file", None, "Output TF example file (or comma-separated list of files).") diff --git a/deepray/datasets/fashion_mnist/fashion_mnist.py b/deepray/datasets/fashion_mnist/fashion_mnist.py index eaff543e..bdb78adb 100644 --- a/deepray/datasets/fashion_mnist/fashion_mnist.py +++ b/deepray/datasets/fashion_mnist/fashion_mnist.py @@ -17,21 +17,21 @@ import gzip import os import sys + import numpy as np import tensorflow as tf from absl import flags -from keras.utils.data_utils import get_file +from keras.src.utils.data_utils import get_file -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class FashionMNIST(DataPipeLine): +class FashionMNIST(DataPipeline): def __init__(self): """Loads the Fashion-MNIST dataset. 
@@ -104,16 +104,8 @@ def __len__(self): pass def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): - if is_training: with gzip.open(self.paths[0], "rb") as lbpath: y = np.frombuffer(lbpath.read(), np.uint8, offset=8) @@ -130,5 +122,5 @@ def build_dataset( dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x[..., tf.newaxis] / 255.0, tf.float32), tf.cast(y, tf.int64)) ) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset diff --git a/deepray/datasets/fashion_mnist/fashion_mnist_test.py b/deepray/datasets/fashion_mnist/fashion_mnist_test.py index d18d6dc1..73c3d9c8 100644 --- a/deepray/datasets/fashion_mnist/fashion_mnist_test.py +++ b/deepray/datasets/fashion_mnist/fashion_mnist_test.py @@ -9,8 +9,6 @@ from .fashion_mnist import FashionMNIST -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/imagenet-1k/imagenet_to_gcs.py b/deepray/datasets/imagenet-1k/imagenet_to_gcs.py index 712f34a4..131606e9 100644 --- a/deepray/datasets/imagenet-1k/imagenet_to_gcs.py +++ b/deepray/datasets/imagenet-1k/imagenet_to_gcs.py @@ -57,8 +57,6 @@ 'Should have train and validation subdirectories inside it.' ) -FLAGS = flags.FLAGS - LABELS_FILE = 'synset_labels.txt' TRAINING_SHARDS = 1024 @@ -384,6 +382,5 @@ def main(_): if __name__ == '__main__': - logging.set_verbosity(logging.INFO) tf.disable_v2_behavior() app.run(main) diff --git a/deepray/datasets/imdb/imdb.py b/deepray/datasets/imdb/imdb.py index e257d87c..b1bc3f54 100644 --- a/deepray/datasets/imdb/imdb.py +++ b/deepray/datasets/imdb/imdb.py @@ -24,14 +24,12 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine - -FLAGS = flags.FLAGS +from deepray.datasets.datapipeline import DataPipeline AUTOTUNE = tf.data.AUTOTUNE -class IMDB(DataPipeLine): +class IMDB(DataPipeline): def __init__(self, url='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', **kwargs): super().__init__(**kwargs) @@ -57,9 +55,7 @@ def parser(self, record): y = tokenized_sentences[:, 1:] return x, y - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if is_training: raw_ds = tf.keras.utils.text_dataset_from_directory( diff --git a/deepray/datasets/imdb/imdb_test.py b/deepray/datasets/imdb/imdb_test.py index 601357ce..c421df93 100644 --- a/deepray/datasets/imdb/imdb_test.py +++ b/deepray/datasets/imdb/imdb_test.py @@ -9,8 +9,6 @@ from .imdb import IMDB -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/kafka_dataset.py b/deepray/datasets/kafka_dataset.py deleted file mode 100644 index 5045eb11..00000000 --- a/deepray/datasets/kafka_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -from tensorflow.python.data.ops import readers -import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine -from absl import flags - -FLAGS = flags.FLAGS - - -class KafkaDataset(DataPipeLine): - - def parse(self, raw_message, raw_key): - context_features, sequence_features = {}, {} - 
for key, dim in self.feature_map["FLOAT"].items(): - context_features[key] = tf.io.FixedLenFeature([], tf.float32) - for key, dim in self.feature_map["INT"].items(): - context_features[key] = tf.io.FixedLenFeature([], tf.int64) - for key, dim in self.feature_map["VARINT"].items(): - sequence_features[key] = tf.io.VarLenFeature(tf.int64) - - tensor, sparse_tensor = tf.io.parse_single_sequence_example( - serialized=raw_message, context_features=context_features, sequence_features=sequence_features - ) - reshaped_tensor = {} - for fea in context_features: - reshaped_tensor[fea] = tensor[fea] - # reshaped_tensor[fea] = tf.reshape(tensor[fea], [1]) - label = reshaped_tensor.pop(FLAGS.label) - for fea in sequence_features: - reshaped_tensor[fea] = sparse_tensor[fea] - # reshaped_tensor[fea] = tf.sparse.reshape(sparse_tensor[fea], [-1]) - return reshaped_tensor, label - - def build_dataset(self): - dataset = ( - readers.KafkaGroupIODataset( - topics=self.conf["Kafka"]["topics"], - group_id=self.conf["Kafka"]["group_id"], - servers=self.conf["Kafka"]["servers"], - stream_timeout=3000, - configuration=self.conf["Kafka"]["configuration"], - ).map(map_func=self.parse, num_parallel_calls=FLAGS.parallel_parse).batch(FLAGS.batch_size) - ) - return dataset diff --git a/deepray/datasets/kafka_pipeline/__init__.py b/deepray/datasets/kafka_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/kafka_pipeline/kafka_pipeline.py b/deepray/datasets/kafka_pipeline/kafka_pipeline.py new file mode 100644 index 00000000..5ce39e62 --- /dev/null +++ b/deepray/datasets/kafka_pipeline/kafka_pipeline.py @@ -0,0 +1,254 @@ +import multiprocessing +import sys +from abc import ABC + +import tensorflow as tf +from tensorflow_io.python.ops import core_ops + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils import logging_util + +logger = logging_util.get_logger() + + +class KafkaGroupIODataset(tf.data.Dataset): + """Represents a streaming dataset from kafka using consumer groups. + + The dataset is created by fetching messages from kafka using consumer clients + which are part of a consumer group. Owing to the offset management capability of + the kafka brokers, the dataset can maintain offsets of all the partitions + without explicit initialization. If the consumer client joins an existing + consumer group, it will start fetching messages from the already committed offsets. + To start fetching the messages from the beginning, please join a different consumer group. + The dataset will be prepared from the committed/start offset until the last offset. + + The dataset can be prepared and iterated in the following manner: + + >>> import tensorflow_io as tfio + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092" + ) + + >>> for (message, key) in dataset: + ... print(message) + + Cases may arise where the consumer read time out issues arise due to + the consumer group being in a rebalancing state. In order to address that, please + set `session.timeout.ms` and `max.poll.interval.ms` values in the configuration tensor + and try again after the group rebalances. For example: considering the kafka cluster + has been setup with the default settings, `max.poll.interval.ms` would be `300000ms`. + It can be changed to `8000ms` to reduce the time between pools. Also, the `session.timeout.ms` + can be changed to `7000ms`. 
However, the value for `session.timeout.ms` should be + according to the following relation: + + - `group.max.session.timeout.ms` in server.properties > `session.timeout.ms` in the + consumer.properties. + - `group.min.session.timeout.ms` in server.properties < `session.timeout.ms` in the + consumer.properties + + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092", + configuration=[ + "session.timeout.ms=7000", + "max.poll.interval.ms=8000", + "auto.offset.reset=earliest", + ], + ) + + In the above example, the `auto.offset.reset` configuration is set to `earliest` so that + in case the consumer group is being newly created, it will start reading the messages from + the beginning. If it is not set, it defaults to `latest`. For additional configurations, + please refer the librdkafka's configurations: + https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md + + In addition to the standard streaming functionality, there is added support for a timeout + based stream. Once the existing data has been fetched, this dataset will block for + an additional `stream_timeout` milliseconds, for the new messages to be captured. + + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092", + stream_timeout=30000, + configuration=[ + "session.timeout.ms=7000", + "max.poll.interval.ms=8000", + "auto.offset.reset=earliest", + ], + ) + >>> for (message, key) in dataset: + ... print(message) + + The above loop will run as long as the consumer clients are able to fetch messages + from the topic(s). However, since we set the `stream_timeout` value to `15000` milliseconds, + the dataset will wait for any new messages that might be added to the topic for that duration. + + As the kafka deployments vary in configuration as per various use-cases, the time required for + the consumers to fetch a single message might also vary. This timeout value can be adjusted + using the `message_poll_timeout` parameter. + + The `message_poll_timeout` value represents the duration which the consumers + have to wait while fetching a new message. However, even if we receive a new message + before the `message_poll_timeout` interval finishes, the consumer doesn't resume the + consumption but it will wait until the `message_poll_timeout` interval has finished. + Thus, if we want to block indefinitely until a new message arrives, + we cannot do it with `message_poll_timeout` alone. This is when the `stream_timeout` + value comes in, where we can set the value to a very high timeout + (i.e, block indefinitely) and keep on polling for new messages at + `message_poll_timeout` intervals. + """ + + def __init__( + self, + topics, + group_id, + servers, + stream_timeout=0, + message_poll_timeout=10000, + configuration=None, + internal=True, + ): + """ + Args: + topics: A `tf.string` tensor containing topic names in [topic] format. + For example: ["topic1", "topic2"] + group_id: The id of the consumer group. For example: cgstream + servers: An optional list of bootstrap servers. + For example: `localhost:9092`. + stream_timeout: An optional timeout duration (in milliseconds) to block until + the new messages from kafka are fetched. + By default it is set to 0 milliseconds and doesn't block for new messages. + To block indefinitely, set it to -1. 
+ message_poll_timeout: An optional timeout duration (in milliseconds) + after which the kafka consumer throws a timeout error while fetching + a single message. This value also represents the intervals at which + the kafka topic(s) are polled for new messages while using the `stream_timeout` + configuration: An optional `tf.string` tensor containing + configurations in [Key=Value] format. + Global configuration: please refer to 'Global configuration properties' + in librdkafka doc. Examples include + ["enable.auto.commit=false", "heartbeat.interval.ms=2000"] + Topic configuration: please refer to 'Topic configuration properties' + in librdkafka doc. Note all topic configurations should be + prefixed with `conf.topic.`. Examples include + ["conf.topic.auto.offset.reset=earliest"] + Reference: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md + internal: Whether the dataset is being created from within the named scope. + Default: True + """ + with tf.name_scope("KafkaGroupIODataset"): + assert internal + + if stream_timeout == -1: + stream_timeout = sys.maxsize + elif stream_timeout >= 0: + # Taking the max of `stream_timeout` and `message_poll_timeout` + # to prevent the user from bothering about the underlying polling + # mechanism. + stream_timeout = max(stream_timeout, message_poll_timeout) + else: + raise ValueError("Invalid stream_timeout value: {} ,set it to -1 to block indefinitely.".format(stream_timeout)) + metadata = list(configuration or []) + if group_id is not None: + metadata.append("group.id=%s" % group_id) + if servers is not None: + metadata.append("bootstrap.servers=%s" % servers) + resource = core_ops.io_kafka_group_readable_init(topics=topics, metadata=metadata) + + self._resource = resource + dataset = tf.data.Dataset.counter() + dataset = dataset.map( + lambda i: core_ops.io_kafka_group_readable_next( + input=self._resource, + index=i, + message_poll_timeout=message_poll_timeout, + stream_timeout=stream_timeout, + ) + ) + dataset = dataset.take_while(lambda v: tf.greater(v.continue_fetch, 0)) + dataset = dataset.map(lambda v: v.message) + dataset = dataset.unbatch() + + self._dataset = dataset + super().__init__(self._dataset._variant_tensor) # pylint: disable=protected-access + + def _inputs(self): + return [] + + @property + def element_spec(self): + return self._dataset.element_spec + + +class KafkaPipeline(DataPipeline, ABC): + + def __init__( + self, + topics, + group_id, + servers, + stream_timeout=None, + configuration=None, + compression_type=None, + num_client=1, + **kwargs + ): + super().__init__(**kwargs) + self.topics = topics + self.group_id = group_id + self.servers = servers + self.stream_timeout = stream_timeout + self.configuration = configuration + self.compression_type = compression_type + self.num_client = num_client + + def build_dataset( + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs + ): + if self.num_client > 1: + logger.info(f"Using {self.num_client} Kafka clients.") + clients = tuple( + KafkaGroupIODataset( + topics=self.topics, + group_id=self.group_id, + servers=self.servers, + stream_timeout=self.stream_timeout, + configuration=self.configuration, + ) for _ in range(self.num_client) + ) + dataset = tf.data.Dataset.zip(clients) + dataset = dataset.map(lambda *x: tf.stack(x, axis=-1)).unbatch() + else: + dataset = KafkaGroupIODataset( + topics=self.topics, + group_id=self.group_id, + servers=self.servers, + stream_timeout=self.stream_timeout, + 
configuration=self.configuration, + ) + logger.info( + "Using only one Kafka client, if there is an IO bottleneck, it is recommended to adjust 'num_client' to increase the number of Kafka clients" + ) + if self.prebatch_size: + if batch_size > self.prebatch_size: + dataset = dataset.batch( + batch_size=batch_size // self.prebatch_size, + num_parallel_calls=tf.data.AUTOTUNE, + deterministic=True, + drop_remainder=True + ) + else: + dataset = dataset.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE, deterministic=True, drop_remainder=True) + if self.compression_type: + dataset = dataset.map( + lambda v: tf.io.decode_compressed(v, compression_type=self.compression_type), multiprocessing.cpu_count() + ) + if not hasattr(self.parser, "__isabstractmethod__"): + dataset = dataset.map(self.parser, multiprocessing.cpu_count()) + if self.prebatch_size and batch_size % self.prebatch_size != 0: + dataset = dataset.unbatch().batch(batch_size) + return dataset diff --git a/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py b/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py new file mode 100644 index 00000000..bc957110 --- /dev/null +++ b/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py @@ -0,0 +1,52 @@ +# -*- coding: UTF-8 -*- +import sys +from absl import flags +import tensorflow as tf + +from tf_keras import backend as K + +import deepray as dp +from deepray.utils.benchmark import PerformanceCalculator +from deepray.utils import logging_util + +from deepray.utils.horovod_utils import is_main_process + +from deepray.datasets.kafka_pipeline.kafka_pipeline import KafkaPipeline + +logger = logging_util.get_logger() + + +def main(): + + data_pipe = KafkaPipeline( + # dataset_name=flags.FLAGS.dataset, + # partitions=[{'ds': date} for date in get_dates()], + ) + + train_dataset = data_pipe(input_file_pattern=None, batch_size=flags.FLAGS.batch_size) + + _performance_calculator = PerformanceCalculator(0, 1000) + num_examples = 0 + step = 0 + + for sample in train_dataset.take(1000): + step += 1 + # example = tf.train.Example() + # example.ParseFromString(sample[0].numpy()) + print(sample) + # print(key) + step_throughput = _performance_calculator(1, flags.FLAGS.batch_size) + + if num_examples % 100 == 0: + logger.info(f'step {step}, Perf {step_throughput} samples/s') + + print(num_examples) + results_perf = _performance_calculator.results + if not _performance_calculator.completed: + print(f"self._performance_calculator.completed: {_performance_calculator.completed}") + results_perf = _performance_calculator.get_current_benchmark_results() + print(results_perf) + + +if __name__ == "__main__": + dp.runner(main) diff --git a/deepray/datasets/mnist/mnist.py b/deepray/datasets/mnist/mnist.py index cd4051d9..f328534f 100644 --- a/deepray/datasets/mnist/mnist.py +++ b/deepray/datasets/mnist/mnist.py @@ -19,19 +19,13 @@ import numpy as np import tensorflow as tf from absl import flags -from keras.utils.data_utils import get_file +from keras.src.utils.data_utils import get_file -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS -FLAGS([ - sys.argv[0], - "--num_train_examples=60000", -]) - -class Mnist(DataPipeLine): +class Mnist(DataPipeline): def __init__(self, path="mnist.npz"): """Loads the MNIST dataset. 
@@ -80,6 +74,12 @@ def __init__(self, path="mnist.npz"): https://creativecommons.org/licenses/by-sa/3.0/) """ super().__init__() + + flags.FLAGS([ + sys.argv[0], + "--num_train_examples=60000", + ]) + origin_folder = ("https://storage.googleapis.com/tensorflow/tf-keras-datasets/") self.path = get_file( path, @@ -90,17 +90,15 @@ def __init__(self, path="mnist.npz"): ) def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): with np.load(self.path, allow_pickle=True) as f: if is_training: - x, y = f["x_train"], f["y_train"] + image, label = f["x_train"], f["y_train"] else: - x, y = f["x_test"], f["y_test"] + image, label = f["x_test"], f["y_test"] - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x[..., tf.newaxis] / 255.0, tf.float32), tf.cast(y, tf.int64)) - ) + dataset = tf.data.Dataset.from_tensor_slices((tf.cast(image[..., tf.newaxis] / 255.0, tf.float32), label)) if self.use_horovod: # For multi-host training, we want each hosts to always process the same # subset of files. Each host only sees a subset of the entire dataset, diff --git a/deepray/datasets/mnist/mnist_test.py b/deepray/datasets/mnist/mnist_test.py index a79ec9f7..07cc49c6 100644 --- a/deepray/datasets/mnist/mnist_test.py +++ b/deepray/datasets/mnist/mnist_test.py @@ -2,41 +2,16 @@ # @Time : 2021/8/10 2:50 PM # @Author : Hailin.Fu # @license : Copyright(C), -import sys -from datetime import datetime +from absl import flags +from deepray.datasets.mnist import Mnist -from absl import app, flags +data_pipe = Mnist() +# create data pipline of train & test dataset +train_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, is_training=True) +test_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, is_training=False) -from .mnist import Mnist +num_examples = 0 +for x in train_dataset: + num_examples += flags.FLAGS.batch_size -FLAGS = flags.FLAGS - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=16", - "-epochs=1", - "--train_data=movielens/1m-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Mnist() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) +print(num_examples) diff --git a/deepray/datasets/movielens/movielens.csv b/deepray/datasets/movielens/movielens.csv index c200ef88..61f0f17e 100644 --- a/deepray/datasets/movielens/movielens.csv +++ b/deepray/datasets/movielens/movielens.csv @@ -1,4 +1,4 @@ -name,dtype,ftype,dim,length,voc_size,lr,optimizer,storage_type,composition_factor,ev_filter +name,dtype,ftype,dim,length,voc_size user_rating,int64,Label,1,1 user_id,int64,Categorical,32,1 movie_id,int64,Categorical,32,1 \ No newline at end of file diff --git a/deepray/datasets/movielens/movielens.py b/deepray/datasets/movielens/movielens.py index 6a941292..dbba39c4 100644 --- a/deepray/datasets/movielens/movielens.py +++ b/deepray/datasets/movielens/movielens.py @@ -4,11 +4,11 @@ import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import 
DataPipeline from deepray.datasets.movielens import constants as rconst -class Movielens(DataPipeLine): +class Movielens(DataPipeline): @staticmethod def parser(self, serialized_data, batch_size=None, is_training=True): diff --git a/deepray/datasets/movielens/movielens_100k_ratings.py b/deepray/datasets/movielens/movielens_100k_ratings.py index 98d2e493..342ece15 100644 --- a/deepray/datasets/movielens/movielens_100k_ratings.py +++ b/deepray/datasets/movielens/movielens_100k_ratings.py @@ -1,50 +1,76 @@ -"""NCF model input pipeline.""" - import os -import sys +import numpy as np import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=100000", - "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) +class Movielens100kRating(DataPipeline): -class Movielens100kRating(DataPipeLine): + def __init__(self, split=False, **kwargs): + super().__init__(**kwargs) + self.split = split + flags.FLAGS([ + "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), + ]) + import tensorflow_datasets as tfds + # Ratings data. + self.ratings = tfds.load("movielens/100k-ratings", split="train", data_dir="/datasets/", download=True) + # Features of all the available movies. + self.movies = tfds.load('movielens/100k-movies', split="train", data_dir="/datasets/", download=True) + users = self.ratings.map(lambda x: x["user_id"], os.cpu_count()) + movie_ids = self.movies.map(lambda x: x["movie_id"], os.cpu_count()) + movies = self.movies.map(lambda x: x["movie_title"], os.cpu_count()) + self.user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.user_ids_vocabulary.adapt(users.batch(1_000_000)) + self.movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_ids_vocabulary.adapt(movie_ids.batch(1_000_000)) + self.movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_titles_vocabulary.adapt(movies.batch(1_682)) + + def get_vocabulary(self, feature): + if feature == "user_id": + return self.user_ids_vocabulary.get_vocabulary() + elif feature == "movie_id": + return self.movie_ids_vocabulary.get_vocabulary() + elif feature == "movie_title": + return self.movie_titles_vocabulary.get_vocabulary() + else: + column = self.original_dataset.map(lambda x: { + feature: x[feature] + }, os.cpu_count()).batch(self.__len__).map(lambda x: x[feature], os.cpu_count()) + return np.unique(np.concatenate(list(column))) def parser(self, record): return { "movie_id": tf.strings.to_number(record["movie_id"], tf.int64), - "user_id": tf.strings.to_number(record["user_id"], tf.int64), + "movie_title": self.movie_titles_vocabulary(record["movie_title"]), + "user_id": self.user_ids_vocabulary(record["user_id"]), "movie_genres": tf.cast(record["movie_genres"][0], tf.int32), "user_gender": tf.cast(record["user_gender"], tf.int32), "user_occupation_label": tf.cast(record["user_occupation_label"], tf.int32), "raw_user_age": tf.cast(record["raw_user_age"], tf.int32), - "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32), + "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32) }, record["user_rating"] def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, 
*args, **kwargs ): - import tensorflow_datasets as tfds - ratings = tfds.load("movielens/100k-ratings", split="train", data_dir="/dataset/", download=True) - ratings = ratings.map( - self.parser - # lambda x: { - # "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), - # "user_id": tf.strings.to_number(x["user_id"], tf.int64), - # "user_rating": x["user_rating"] - # } - ) - ratings = ratings.repeat(FLAGS.epochs) - shuffled = ratings.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) - dataset = shuffled.batch(batch_size) + dataset = self.ratings.map(self.parser, os.cpu_count()) + if epochs > 1: + dataset = dataset.repeat(epochs) + if shuffle: + dataset = dataset.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) + if self.split: + if is_training: + dataset = dataset.take(80_000) + else: + dataset = dataset.skip(80_000).take(20_000) + dataset = dataset.batch(batch_size) return dataset + + @property + def __len__(self): + return 1_000_000 diff --git a/deepray/datasets/movielens/movielens_100k_ratings_test.py b/deepray/datasets/movielens/movielens_100k_ratings_test.py deleted file mode 100644 index 80459b3a..00000000 --- a/deepray/datasets/movielens/movielens_100k_ratings_test.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# @Time : 2021/8/10 2:50 PM -# @Author : Hailin.Fu -# @license : Copyright(C), -import sys -from datetime import datetime - -from absl import app, flags, logging - -from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating - -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - # "--batch_size=16", - "-epochs=1", - "--train_data=movielens/100k-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - # "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Movielens100kRating() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) diff --git a/deepray/datasets/movielens/movielens_1m_ratings.py b/deepray/datasets/movielens/movielens_1m_ratings.py index 53a2c241..94b768f6 100644 --- a/deepray/datasets/movielens/movielens_1m_ratings.py +++ b/deepray/datasets/movielens/movielens_1m_ratings.py @@ -1,46 +1,76 @@ -"""NCF model input pipeline.""" import os -import sys +import numpy as np import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=1000224", + +class Movielens1MRating(DataPipeline): + + def __init__(self, split=False, **kwargs): + super().__init__(**kwargs) + self.split = split + flags.FLAGS([ "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) + ]) + import tensorflow_datasets as tfds + # Ratings data. + self.ratings = tfds.load("movielens/1m-ratings", split="train", data_dir="/datasets/", download=True) + # Features of all the available movies. 
+ self.movies = tfds.load('movielens/1m-movies', split="train", data_dir="/datasets/", download=True) + users = self.ratings.map(lambda x: x["user_id"], os.cpu_count()) + movie_ids = self.movies.map(lambda x: x["movie_id"], os.cpu_count()) + movies = self.movies.map(lambda x: x["movie_title"], os.cpu_count()) + self.user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.user_ids_vocabulary.adapt(users.batch(1_000_000)) + self.movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_ids_vocabulary.adapt(movie_ids.batch(1_000_000)) + self.movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_titles_vocabulary.adapt(movies.batch(1_682)) + def get_vocabulary(self, feature): + if feature == "user_id": + return self.user_ids_vocabulary.get_vocabulary() + elif feature == "movie_id": + return self.movie_ids_vocabulary.get_vocabulary() + elif feature == "movie_title": + return self.movie_titles_vocabulary.get_vocabulary() + else: + column = self.original_dataset.map(lambda x: { + feature: x[feature] + }, os.cpu_count()).batch(self.__len__).map(lambda x: x[feature], os.cpu_count()) + return np.unique(np.concatenate(list(column))) -class Movielens1MRating(DataPipeLine): + def parser(self, record): + return { + "movie_id": self.movie_ids_vocabulary(record["movie_id"]), + "movie_title": self.movie_titles_vocabulary(record["movie_title"]), + "user_id": self.user_ids_vocabulary(record["user_id"]), + "movie_genres": tf.cast(record["movie_genres"][0], tf.int32), + "user_gender": tf.cast(record["user_gender"], tf.int32), + "user_occupation_label": tf.cast(record["user_occupation_label"], tf.int32), + "bucketized_user_age": tf.cast(record["bucketized_user_age"], tf.int32), + "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32) + }, record["user_rating"] def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): - import tensorflow_datasets as tfds - dataset = tfds.load(input_file_pattern, split='train') - features = dataset.map( - lambda x: { - "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), - "user_id": tf.strings.to_number(x["user_id"], tf.int64), - } - ) - ratings = dataset.map(lambda x: tf.one_hot(tf.cast(x['user_rating'] - 1, dtype=tf.int64), 5)) - dataset = dataset.zip((features, ratings)) - dataset = dataset.repeat(FLAGS.epochs) - dataset = dataset.shuffle(1024, reshuffle_each_iteration=False) + dataset = self.ratings.map(self.parser, os.cpu_count()) + if epochs > 1: + dataset = dataset.repeat(epochs) + if shuffle: + dataset = dataset.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) + if self.split: + if is_training: + dataset = dataset.take(80_000) + else: + dataset = dataset.skip(80_000).take(20_000) dataset = dataset.batch(batch_size) - return dataset + + @property + def __len__(self): + return 1_000_224 diff --git a/deepray/datasets/movielens/movielens_1m_ratings_test.py b/deepray/datasets/movielens/movielens_1m_ratings_test.py deleted file mode 100644 index 53373aa2..00000000 --- a/deepray/datasets/movielens/movielens_1m_ratings_test.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# @Time : 2021/8/10 2:50 PM -# @Author : Hailin.Fu -# @license : Copyright(C), -import sys -from datetime import datetime - -from absl import app, flags, logging - -from 
deepray.datasets.movielens.movielens_1m_ratings import Movielens1MRating - -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=16", - "-epochs=1", - "--train_data=movielens/1m-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Movielens1MRating() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) diff --git a/deepray/datasets/movielens/movielens_ratings_test.py b/deepray/datasets/movielens/movielens_ratings_test.py new file mode 100644 index 00000000..87174de7 --- /dev/null +++ b/deepray/datasets/movielens/movielens_ratings_test.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# @Time : 2021/8/10 2:50 PM +# @Author : Hailin.Fu +# @license : Copyright(C), +import sys, os +import deepray as dp +from datetime import datetime +from absl import app, flags, logging + +from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") + + +def define_flags(): + argv = sys.argv + [ + "--epochs=1", + "--batch_size=2", + "--train_data=movielens/100k-ratings", + ] + flags.FLAGS(argv) + + +def runner(): + data_pipe = Movielens100kRating() + train_dataset = data_pipe(flags.FLAGS.batch_size, is_training=True) + num_examples = 0 + + for x in train_dataset: + num_examples += flags.FLAGS.batch_size + + print(x) + print(num_examples) + + +if __name__ == "__main__": + dp.runner(runner) diff --git a/deepray/datasets/movielens/process.py b/deepray/datasets/movielens/process.py index c555f152..65f5e51c 100644 --- a/deepray/datasets/movielens/process.py +++ b/deepray/datasets/movielens/process.py @@ -37,7 +37,7 @@ # pylint: enable=g-bad-import-order # URL to download dataset -_DATA_URL = "http://minio1.arsenal.kanzhun-inc.com/datasets/movielens/" +_DATA_URL = "https://files.grouplens.org/datasets/movielens/" GENRES = [ 'Action', 'Adventure', 'Animation', "Children", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', @@ -276,5 +276,5 @@ def main(_): if __name__ == "__main__": - FLAGS = flags.FLAGS + app.run(main) diff --git a/deepray/datasets/movielens/producer.py b/deepray/datasets/movielens/producer.py index 24008f44..d047876e 100644 --- a/deepray/datasets/movielens/producer.py +++ b/deepray/datasets/movielens/producer.py @@ -6,21 +6,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.datasets.movielens import constants as rconst -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=5049000", - # "--=6138000" - "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) - -class Produce(DataPipeLine): +class Produce(DataPipeline): def __init__(self, params, producer): self._producer = producer diff --git a/deepray/datasets/openwebtext/openwebtext.py b/deepray/datasets/openwebtext/openwebtext.py index d4f5df24..0c623c14 100644 --- a/deepray/datasets/openwebtext/openwebtext.py +++ b/deepray/datasets/openwebtext/openwebtext.py @@ -20,22 
+20,21 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class Openwebtext(DataPipeLine): +class Openwebtext(DataPipeline): def __init__(self, max_seq_length, **kwargs): super().__init__(**kwargs) self._max_seq_length = max_seq_length - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): """The actual input function.""" input_files = tf.io.gfile.glob(input_file_pattern) diff --git a/deepray/datasets/openwebtext/openwebtext_test.py b/deepray/datasets/openwebtext/openwebtext_test.py index a307503d..a2c95f80 100644 --- a/deepray/datasets/openwebtext/openwebtext_test.py +++ b/deepray/datasets/openwebtext/openwebtext_test.py @@ -12,8 +12,6 @@ from .openwebtext import Openwebtext -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py b/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py index 0dc0a3d2..9ef186e0 100644 --- a/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py +++ b/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py @@ -1,45 +1,223 @@ +import random + +import pandas as pd import tensorflow as tf from absl import flags +from six import string_types +from tensorflow import dtypes from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops -from deepray.custom_ops.parquet_dataset import dataframe from deepray.custom_ops.parquet_dataset import parquet_dataset_ops -from deepray.datasets.datapipeline import DataPipeLine +from deepray.custom_ops.parquet_dataset.python.parquet_pybind import parquet_filenames_and_fields +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_rank, get_world_size + +logger = logging_util.get_logger() + + +def parquet_filenames(filenames, lower=False): + """Check and fetch parquet filenames and fields. + + Args: + filenames: List of Path of parquet file list. + lower: Convert field name to lower case if not found. + + Returns: + Validated file names and fields. + """ + if isinstance(filenames, string_types): + filenames = [filenames] + elif isinstance(filenames, (tuple, list)): + for f in filenames: + if not isinstance(f, string_types): + raise ValueError(f'{f} in `filenames` must be a string') + elif isinstance(filenames, dataset_ops.Dataset): + if filenames.output_types != dtypes.string: + raise TypeError('`filenames` must be a `tf.data.Dataset` of `tf.string` elements.') + if not filenames.output_shapes.is_compatible_with(tensor_shape.TensorShape([])): + raise ValueError('`filenames` must be a `tf.data.Dataset` of scalar `tf.string` ' + 'elements.') + elif isinstance(filenames, ops.Tensor): + if filenames.dtype != dtypes.string: + raise TypeError('`filenames` must be a `tf.Tensor` of `tf.string`.') + else: + raise ValueError( + f'`filenames` {filenames} must be a `tf.data.Dataset` of scalar ' + '`tf.string` elements or can be converted to a `tf.Tensor` of ' + '`tf.string`.' 
+ ) + + if not isinstance(filenames, dataset_ops.Dataset): + filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string) + filenames = array_ops.reshape(filenames, [-1], name='filenames') + filenames = dataset_ops.Dataset.from_tensor_slices(filenames) + return filenames + + +class ParquetDataset(dataset_ops.DatasetV2): # pylint: disable=abstract-method + """A Parquet Dataset that reads batches from parquet files.""" + + VERSION = 2002 + + def __init__( + self, filenames, column_names=None, batch_size=1, num_parallel_reads=None, num_sequential_reads=2, parser=None + ): + """Create a `ParquetDataset`. + + Args: + filenames: A 0-D or 1-D `tf.string` tensor containing one or more + filenames. + batch_size: (Optional.) Maxium number of samples in an output batch. + column_names: (Optional.) List of DataFrame fields. + partition_count: (Optional.) Count of row group partitions. + partition_index: (Optional.) Index of row group partitions. + drop_remainder: (Optional.) If True, only keep batches with exactly + `batch_size` samples. + num_parallel_reads: (Optional.) A `tf.int64` scalar representing the + number of files to read in parallel. Defaults to reading files + sequentially. + num_sequential_reads: (Optional.) A `tf.int64` scalar representing the + number of batches to read in sequential. Defaults to 1. + """ + self._batch_size = batch_size + self._filter = filter + self._parser = parser + + filenames, fields = parquet_filenames_and_fields(filenames, column_names) + filenames = filenames.batch(32) + + def _create_dataset(f): + dataset = parquet_dataset_ops.ParquetDataset( + filenames=f, + fields=fields, + batch_size=self._batch_size, + ) + if self._parser: + dataset = dataset.map(self._parser, num_parallel_calls=tf.data.AUTOTUNE) + return dataset + + self._impl = self._build_dataset( + _create_dataset, filenames, num_parallel_reads=num_parallel_reads, num_sequential_reads=num_sequential_reads + ) + super().__init__(self._impl._variant_tensor) # pylint: disable=protected-access + + def _inputs(self): + return self._impl._inputs() # pylint: disable=protected-access + + @property + def element_spec(self): + return self._impl.element_spec # pylint: disable=protected-access + + def _build_dataset(self, dataset_creator, filenames, num_parallel_reads=None, num_sequential_reads=1): + """Internal method to create a `ParquetDataset`.""" + if num_parallel_reads is None: + return filenames.flat_map(dataset_creator) + if num_parallel_reads == dataset_ops.AUTOTUNE: + return filenames.interleave(dataset_creator, num_parallel_calls=2, deterministic=False) + return readers.ParallelInterleaveDataset( + filenames, + dataset_creator, + cycle_length=num_parallel_reads, + block_length=num_sequential_reads, + sloppy=True, + buffer_output_elements=None, + prefetch_input_elements=1 + ) + -FLAGS = flags.FLAGS +class ParquetPipeline(DataPipeline): + def __init__(self, column_names=[], **kwargs): + super().__init__(**kwargs) + self.column_names = column_names -class ParquetPipeLine(DataPipeLine): + # duplicate value check + visited = set() + dup_values = [name for name in self.column_names if name in visited or (visited.add(name) or False)] + assert len(dup_values) == 0, "The column_names input parameter has duplicate values: " + str(dup_values) + + self.info_df = pd.DataFrame() def parse(self, record): label_map = {} - for label in FLAGS.label: + for label in flags.FLAGS.label: # label_map[label] = record.pop(label) label_map[label] = tf.reshape(record.pop(label), [-1, 1]) return record, label_map - def 
build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): - """Makes dataset (of filenames) from filename glob patterns.""" - # Extract lines from input files using the Dataset API. - - file_list = self.read_list_from_file(input_file_pattern) - - dataset = parquet_dataset_ops.ParquetDataset( - file_list, - batch_size=batch_size, - fields=[ - parquet_dataset_ops.DataFrame.Field(k, dtype, ragged_rank=1 if length != 1 else 0) - for k, dtype, length in self.feature_map[["name", "dtype", "length"]].values - ] - ).apply(dataframe.to_sparse()) - dataset = dataset.map( - map_func=self.parse, - num_parallel_calls=FLAGS.parallel_parse if FLAGS.parallel_parse else dataset_ops.AUTOTUNE, - ) - if FLAGS.shuffle_buffer: - dataset = dataset.apply( - tf.data.experimental.shuffle_and_repeat(buffer_size=FLAGS.shuffle_buffer, count=FLAGS.epochs) + def build_dataset( + self, + input_file_pattern, + batch_size, + is_training=True, + epochs=1, + shuffle=False, + *args, + **kwargs + ) -> tf.data.Dataset: + if isinstance(input_file_pattern, str): + data_file_list = self.read_list_from_file(input_file_pattern) + else: + data_file_list = input_file_pattern + if not data_file_list: + raise ValueError("The input file list is empty!") + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. + random_state = flags.FLAGS.random_seed if flags.FLAGS.random_seed else 1024 + if shuffle and isinstance(data_file_list, list): + random.Random(random_state).shuffle(data_file_list) + logger.info(f"Shuffling {len(data_file_list)} parquet files.") + if isinstance(data_file_list, str) or len(data_file_list) < get_world_size(): + dataset = parquet_dataset_ops.ParquetDataset( + filenames=data_file_list, + fields=self.column_names if self.column_names else None, + batch_size=batch_size, ) + if self.use_horovod: + # For multi-host training, we want each hosts to always process the same + # subset of files. Each host only sees a subset of the entire dataset, + # allowing us to cache larger datasets in memory. + dataset = dataset.shard(num_shards=get_world_size(), index=get_rank()) + logger.info("Using samples distributing strategy ❤") + if not hasattr(self.parser, "__isabstractmethod__"): + dataset = dataset.map(self.parser, tf.data.AUTOTUNE) else: - dataset = dataset.repeat(FLAGS.epochs) + if self.use_horovod: + # For multi-host training, we want each hosts process different + # subset of files. Each host only sees a subset of the entire dataset, + # allowing us to cache larger datasets in memory. + data_file_list = [data_file_list[i] for i in range(len(data_file_list)) if i % get_world_size() == get_rank()] + logger.info("Using files distributing strategy ❤") + dataset = ParquetDataset( + filenames=data_file_list, + column_names=self.column_names if self.column_names else None, + batch_size=batch_size, + num_parallel_reads=dataset_ops.AUTOTUNE, + parser=None if hasattr(self.parser, "__isabstractmethod__") else self.parser + ) + + # if not hasattr(self.parser, "__isabstractmethod__"): + # dataset = dataset.map(self.parser, multiprocessing.cpu_count()) + # dataset = dataset.ignore_errors() + # Prefetch overlaps in-feed with training + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.with_options(self._dataset_options(data_file_list)) + # Using `ignore_errors()` will drop the element that causes an error. 
+ # dataset = dataset.apply(tf.data.experimental.ignore_errors()) + + if shuffle: + shuffle_buffer = kwargs.get("shuffle_buffer", 10) + logger.debug(f"kwargs = {kwargs}") + logger.info(f"The shuffle_buffer is {shuffle_buffer}") + dataset = dataset.unbatch().shuffle( + buffer_size=shuffle_buffer, seed=flags.FLAGS.random_seed, reshuffle_each_iteration=False + ).batch(batch_size) + dataset = dataset.prefetch(tf.data.AUTOTUNE) return dataset diff --git a/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py b/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py index 2fdd60aa..5ca9fc3f 100644 --- a/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py +++ b/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py @@ -4,49 +4,44 @@ # @license : Copyright(C), import os import sys -from datetime import datetime -from absl import app, flags +from absl import flags -from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine +import deepray as dp +from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS +os.environ["CUDA_VISIBLE_DEVICES"] = "0" -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") +def define_flags(): + argv = sys.argv + [ + "--batch_size=4096", "--epochs=1", "--dataset=ps_test", + "--feature_map=/workspaces/one-code/shadow-tf/datasets/feature_map.csv", + "--config_file=/workspaces/one-code/shadow-tf/train_feature_process.yaml" + ] + flags.FLAGS(argv) -def runner(argv=None): - dir_path = os.path.dirname(os.path.realpath(__file__)) - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=2", - "--epochs=1", - "--train_data=/workspaces/dataset/ali_display_ad_click/output/*.parquet", - "--feature_map=/workspaces/Deepray2/deepray/datasets/ali_display_ad_click/feature_map.csv", - # "--white_list=examples/Recommendation/yekuan/data_pipeline/white_list", - # f"--feature_map={dir_path}/bz_search_1to3.csv", - "--label=label", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = ParquetPipeLine() +def main(): + define_flags() + filenames = [ + "/workspaces/datasets/00000-1-038360cf-9d9d-454c-8381-6a57bdbf6d57-00001.parquet", + "/workspaces/datasets/01799-1800-26382079-2024-439e-84bf-e7b2231e0a2f-00001.parquet", + ] + data_pipe = ParquetPipeline(column_names=['f_c0', 'f_c1', 'f_c14']) # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) + train_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, input_file_pattern=filenames, is_training=True) _performance_calculator = PerformanceCalculator(0, 1000) - # partitions = data_pipe.get_supported_partitions() - # print(partitions) num_examples = 0 step = 0 for batch in train_dataset.take(1000): step += 1 - num_examples += FLAGS.batch_size - step_throughput = _performance_calculator(1, FLAGS.batch_size) + num_examples += flags.FLAGS.batch_size + step_throughput = _performance_calculator(1, flags.FLAGS.batch_size) print(f'step {step}, Perf {step_throughput} samples/s') + print(batch) print(num_examples) results_perf = _performance_calculator.results @@ -57,4 +52,4 @@ def runner(argv=None): if __name__ == "__main__": - app.run(runner) + dp.runner(main) diff --git a/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py b/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py index 9542f5e7..02324044 100644 --- a/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py +++ 
b/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py @@ -11,8 +11,6 @@ from deepray.utils.benchmark import PerformanceCalculator from .parquet_pipeline import parquet_pipeline -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/squad/classifier_dataset.py b/deepray/datasets/squad/classifier_dataset.py new file mode 100644 index 00000000..dc2e2cb5 --- /dev/null +++ b/deepray/datasets/squad/classifier_dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def single_file_dataset(self, input_file, name_to_features): + """Creates a single-file dataset to be passed for BERT custom training.""" + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if self.use_horovod: + d = d.shard(num_shards=get_world_size(), index=get_rank()) + d = d.map(lambda record: self.decode_record(record, name_to_features)) + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. 
+ if isinstance(input_file, str) or len(input_file) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + d = d.with_options(options) + return d + + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'label_ids': tf.io.FixedLenFeature([], tf.int64), + 'is_real_example': tf.io.FixedLenFeature([], tf.int64), + } + dataset = self.single_file_dataset(input_file_pattern, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. + if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + def parser(record): + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'] + } + y = record['label_ids'] + return x, y + + dataset = dataset.map(parser) + + if is_training: + dataset = dataset.shuffle(100) + dataset = dataset.repeat() + + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/pretrain_dataset.py b/deepray/datasets/squad/pretrain_dataset.py new file mode 100644 index 00000000..6373cdcc --- /dev/null +++ b/deepray/datasets/squad/pretrain_dataset.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def build_dataset( + self, + input_file_pattern, + batch_size, + max_predictions_per_seq, + is_training=True, + epochs=1, + shuffle=False, + *args, + **kwargs + ): + """Creates input dataset from (tf)records files for pretraining.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'masked_lm_positions': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_ids': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_weights': tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), + 'next_sentence_labels': tf.io.FixedLenFeature([1], tf.int64), + } + + dataset = tf.data.Dataset.list_files(input_file_pattern, shuffle=is_training) + if self.use_horovod: + dataset = dataset.shard(num_shards=get_world_size(), index=get_rank()) + + if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + dataset = dataset.repeat() + + # We set shuffle buffer to exactly match total number of + # training files to ensure that training data is well shuffled. + input_files = [] + for input_pattern in input_file_pattern: + input_files.extend(tf.io.gfile.glob(input_pattern)) + dataset = dataset.shuffle(len(input_files)) + + # In parallel, create tf record dataset for each train files. + # cycle_length = 8 means that up to 8 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. 
+ dataset = dataset.interleave( + tf.data.TFRecordDataset, cycle_length=8, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) + + decode_fn = lambda record: self.decode_record(record, name_to_features) + dataset = dataset.map(decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + def parser(record): + """Filter out features to use for pretraining.""" + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'], + 'masked_lm_positions': record['masked_lm_positions'], + 'masked_lm_ids': record['masked_lm_ids'], + 'masked_lm_weights': record['masked_lm_weights'], + 'next_sentence_labels': record['next_sentence_labels'], + } + + y = record['masked_lm_weights'] + + return x, y + + dataset = dataset.map(parser, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training: + dataset = dataset.shuffle(100) + + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/squad.py b/deepray/datasets/squad/squad.py index 991ef79f..67c6d7ce 100644 --- a/deepray/datasets/squad/squad.py +++ b/deepray/datasets/squad/squad.py @@ -21,22 +21,18 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS - -class Squad(DataPipeLine): +class Squad(DataPipeline): def __init__(self, max_seq_length, dataset_type="squad", **kwargs): super().__init__(**kwargs) self._max_seq_length = max_seq_length self.dataset_type = dataset_type - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if self.dataset_type == "squad": return self.create_squad_dataset( input_file_pattern, @@ -224,7 +220,7 @@ def _select_data_from_record(record): if is_training: dataset = dataset.shuffle(100) - dataset = dataset.repeat(FLAGS.epochs) + dataset = dataset.repeat(flags.FLAGS.epochs) dataset = dataset.batch(batch_size, drop_remainder=True) dataset = dataset.prefetch(1024) diff --git a/deepray/datasets/squad/squad_dataset.py b/deepray/datasets/squad/squad_dataset.py new file mode 100644 index 00000000..4cfa4801 --- /dev/null +++ b/deepray/datasets/squad/squad_dataset.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def single_file_dataset(self, input_file, name_to_features): + """Creates a single-file dataset to be passed for BERT custom training.""" + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if self.use_horovod: + d = d.shard(num_shards=get_world_size(), index=get_rank()) + + d = d.map(lambda record: self.decode_record(record, name_to_features)) + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. + if isinstance(input_file, str) or len(input_file) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + d = d.with_options(options) + return d + + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + } + if is_training: + name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64) + name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64) + else: + name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64) + + dataset = self.single_file_dataset(input_file_pattern, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. 
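+    # input_pipeline_id is this host's index in [0, num_input_pipelines), so
+    # each host reads a disjoint slice of the records.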
+ if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + def parser(record): + """Dispatches record to features and labels.""" + x, y = {}, {} + for name, tensor in record.items(): + if name in ('start_positions', 'end_positions'): + y[name] = tensor + elif name == 'input_ids': + x['input_word_ids'] = tensor + elif name == 'segment_ids': + x['input_type_ids'] = tensor + else: + x[name] = tensor + return x, y + + dataset = dataset.map(parser) + + if is_training: + dataset = dataset.shuffle(100) + # dataset = dataset.repeat() + + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/squad_test.py b/deepray/datasets/squad/squad_test.py index 1ceadbc5..03732f97 100644 --- a/deepray/datasets/squad/squad_test.py +++ b/deepray/datasets/squad/squad_test.py @@ -12,8 +12,6 @@ from .squad import Squad -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") SQUAD_VERSION = "1.1" diff --git a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py index 35327304..926b729f 100644 --- a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py +++ b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py @@ -1,15 +1,12 @@ import multiprocessing import tensorflow as tf -from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS - -class TFRecordPipeline(DataPipeLine): +class TFRecordPipeline(DataPipeline): """ Build a pipeline fetching, shuffling, and preprocessing the tfrecord files. 
""" @@ -41,9 +38,7 @@ def parser(self, record): label_map[label] = tensor.pop(label) return tensor, label_map - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): input_files = tf.io.gfile.glob(input_file_pattern) # When `input_file` is a path to a single file or a list diff --git a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py index 4fcde0c1..46052299 100644 --- a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py +++ b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py @@ -11,8 +11,6 @@ from deepray.datasets.tfrecord_pipeline import TFRecordPipeline from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py index 770bb778..1d5b1ffc 100644 --- a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py +++ b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py @@ -9,18 +9,17 @@ from sklearn.model_selection import train_test_split from texthero import preprocessing -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline os.environ['CURL_CA_BUNDLE'] = '' -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=111699", ]) -class ToxicCommentClassificationChallenge(DataPipeLine): +class ToxicCommentClassificationChallenge(DataPipeline): def __init__(self, path="/workspaces/dataset/jigsaw-toxic-comment-classification-challenge", **kwargs): super().__init__(**kwargs) @@ -72,7 +71,7 @@ def __init__(self, path="/workspaces/dataset/jigsaw-toxic-comment-classification self.train_bert = hero.clean(train['comment_text'], clean_text_bert_pipeline) self.test_bert = hero.clean(test['comment_text'], clean_text_bert_pipeline) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: ds = tf.data.Dataset.from_tensor_slices((self.train_bert, self.y_train)) else: diff --git a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py index 53ecee03..6581d356 100644 --- a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py +++ b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py @@ -9,8 +9,6 @@ from .toxic_comment_classification_challenge import ToxicCommentClassificationChallenge -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/wikicorpus_en/wikicorpus_en.py b/deepray/datasets/wikicorpus_en/wikicorpus_en.py index 45b5fc03..95a9c8c5 100644 --- a/deepray/datasets/wikicorpus_en/wikicorpus_en.py +++ b/deepray/datasets/wikicorpus_en/wikicorpus_en.py @@ -21,7 +21,6 @@ from deepray.datasets.tfrecord_pipeline import 
TFRecordPipeline -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=24324736", diff --git a/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py b/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py index dfd1666c..164133f5 100644 --- a/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py +++ b/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py @@ -13,8 +13,6 @@ from .wikicorpus_en import Wikicorpus_en -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/deepray.bzl b/deepray/deepray.bzl index 74a64031..d2428100 100644 --- a/deepray/deepray.bzl +++ b/deepray/deepray.bzl @@ -1,41 +1,59 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION", "D_GLIBCXX_USE_CXX11_ABI") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") +load( + "@org_tensorflow//tensorflow:py.default.bzl", + _plain_py_library = "py_library", +) +load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs", "tf_copts") +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +cc_shared_library = native.cc_shared_library + +def _cuda_copts(opts = []): + """Gets the appropriate set of copts for (maybe) CUDA compilation. + + If we're doing CUDA compilation, returns copts for our particular CUDA + compiler. If we're not doing CUDA compilation, returns an empty list. + + """ + return select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ] + opts, + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ] + opts, + }) def custom_op_library( name, srcs = [], - cuda_srcs = [], + gpu_srcs = [], deps = [], - cuda_deps = [], + gpu_deps = [], copts = [], **kwargs): + """ + Reference: https://github.com/tensorflow/addons/blob/master/tensorflow_addons/tensorflow_addons.bzl + """ deps = deps + [ "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:libtensorflow_cc", "@local_config_tf//:tf_header_lib", ] - if cuda_srcs: - copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) - cuda_copts = copts + if_cuda_is_configured([ - "-x cuda", - "-nvcc_options=relaxed-constexpr", - "-nvcc_options=ftz=true", - ]) - cuda_deps = deps + if_cuda_is_configured(cuda_deps) + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudart_static", - ]) + if gpu_srcs: basename = name.split(".")[0] - native.cc_library( + cuda_library( name = basename + "_gpu", - srcs = cuda_srcs, - deps = cuda_deps, - copts = cuda_copts, - alwayslink = 1, + srcs = gpu_srcs, + copts = copts + tf_copts() + _cuda_copts(), + deps = deps + gpu_deps, **kwargs ) - deps = deps + if_cuda_is_configured([":" + basename + "_gpu"]) + deps = deps + [":" + basename + "_gpu"] copts = copts + select({ "//deepray:windows": [ @@ -67,3 +85,281 @@ def custom_op_library( deps = deps, **kwargs ) + +def clean_dep(target): + """Returns string to 'target' in @org_tensorflow repository. + + Use this function when referring to targets in the @org_tensorflow + repository from macros that may be called from external repositories. + """ + + # A repo-relative label is resolved relative to the file in which the + # Label() call appears, i.e. @org_tensorflow. 
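+    # str() converts the resolved Label back to its canonical string form so it
+    # can be used directly as a select() key.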
+ return str(Label(target)) + +def filegroup(**kwargs): + native.filegroup(**kwargs) + +def _rpath_user_link_flags(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@platforms//os:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), + ], + }) + +def _rpath_linkopts(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@platforms//os:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), + ], + }) + +def _make_search_paths(prefix, levels_to_root): + return ",".join( + [ + "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) + for search_level in range(levels_to_root + 1) + ], + ) + +# buildozer: disable=function-docstring-args +def pybind_extension_opensource( + name, + srcs, + module_name = None, # Unused. + hdrs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + additional_exported_symbols = [], + compatible_with = None, + copts = [], + data = [], + defines = [], + deprecation = None, + enable_stub_generation = False, # Unused. + additional_stubgen_deps = [], # Unused. 
+ features = [], + link_in_framework = False, + licenses = None, + linkopts = [], + pytype_deps = [], + pytype_srcs = [], + restricted_to = None, + srcs_version = "PY3", + testonly = None, + visibility = None, + win_def_file = None): + """Builds a generic Python extension module.""" + _ignore = [enable_stub_generation, additional_stubgen_deps, module_name] # buildifier: disable=unused-variable + p = name.rfind("/") + if p == -1: + sname = name + prefix = "" + else: + sname = name[p + 1:] + prefix = name[:p + 1] + so_file = "%s%s.so" % (prefix, sname) + filegroup_name = "%s_filegroup" % name + pyd_file = "%s%s.pyd" % (prefix, sname) + exported_symbols = [ + "init%s" % sname, + "init_%s" % sname, + "PyInit_%s" % sname, + ] + additional_exported_symbols + + exported_symbols_file = "%s-exported-symbols.lds" % name + version_script_file = "%s-version-script.lds" % name + + exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) + version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) + + native.genrule( + name = name + "_exported_symbols", + outs = [exported_symbols_file], + cmd = "echo '%s' >$@" % exported_symbols_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + native.genrule( + name = name + "_version_script", + outs = [version_script_file], + cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + if static_deps: + cc_library_name = so_file + "_cclib" + cc_library( + name = cc_library_name, + hdrs = hdrs, + srcs = srcs + hdrs, + data = data, + deps = deps, + compatible_with = compatible_with, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + defines = defines, + features = features + ["-use_header_modules"], + restricted_to = restricted_to, + testonly = testonly, + visibility = visibility, + ) + + cc_shared_library( + name = so_file, + roots = [cc_library_name], + dynamic_deps = dynamic_deps, + static_deps = static_deps, + additional_linker_inputs = [exported_symbols_file, version_script_file], + compatible_with = compatible_with, + deprecation = deprecation, + features = features + ["-use_header_modules"], + licenses = licenses, + restricted_to = restricted_to, + shared_lib_name = so_file, + testonly = testonly, + user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ + clean_dep("@platforms//os:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + visibility = visibility, + ) + + # cc_shared_library can generate more than one file. + # Solution to avoid the error "variable '$<' : more than one input file." 
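+        # Selecting only the "main_shared_library_output" output group leaves a
+        # single .so for downstream rules to consume.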
+ filegroup( + name = filegroup_name, + srcs = [so_file], + output_group = "main_shared_library_output", + testonly = testonly, + ) + else: + if link_in_framework: + srcs += tf_binary_additional_srcs() + + cc_binary( + name = so_file, + srcs = srcs + hdrs, + data = data, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + linkopts = linkopts + _rpath_linkopts(name) + select({ + clean_dep("@platforms//os:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + deps = deps + [ + exported_symbols_file, + version_script_file, + ], + defines = defines, + features = features + ["-use_header_modules"], + linkshared = 1, + testonly = testonly, + licenses = licenses, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + + # For Windows, emulate the above filegroup with the shared object. + native.alias( + name = filegroup_name, + actual = so_file, + ) + + # For Windows only. + native.genrule( + name = name + "_pyd_copy", + srcs = [filegroup_name], + outs = [pyd_file], + cmd = "cp $< $@", + output_to_bindir = True, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + testonly = testonly, + ) + + _plain_py_library( + name = name, + data = select({ + clean_dep("//deepray:windows"): [pyd_file], + "//conditions:default": [so_file], + }) + pytype_srcs, + deps = pytype_deps, + srcs_version = srcs_version, + licenses = licenses, + testonly = testonly, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + +# Export open source version of pybind_extension under base name as well. 
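+#
+# A typical call site might look like the following (the target and file names
+# here are illustrative only, not taken from this repository):
+#
+#   pybind_extension(
+#       name = "_example_ops",
+#       srcs = ["example_ops_pybind.cc"],
+#       deps = [":example_ops_lib"],
+#   )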
+pybind_extension = pybind_extension_opensource diff --git a/deepray/layers/BUILD b/deepray/layers/BUILD index 4b73fd12..57ef2821 100644 --- a/deepray/layers/BUILD +++ b/deepray/layers/BUILD @@ -13,8 +13,9 @@ py_library( "//deepray/activations", "//deepray/layers/rnn", "//deepray/testing", - "//deepray/text", + # "//deepray/text", "//deepray/utils", + "@pypi_pandas//:pkg", ], ) diff --git a/deepray/layers/__init__.py b/deepray/layers/__init__.py index d445c511..88a5336f 100644 --- a/deepray/layers/__init__.py +++ b/deepray/layers/__init__.py @@ -38,10 +38,9 @@ from deepray.layers.spatial_pyramid_pooling import SpatialPyramidPooling2D from deepray.layers.tlu import TLU from deepray.layers.wrappers import WeightNormalization -from deepray.layers.esn import ESN from deepray.layers.stochastic_depth import StochasticDepth from deepray.layers.noisy_dense import NoisyDense -from deepray.layers.crf import CRF +# from deepray.layers.crf import CRF from deepray.layers.on_device_embedding import OnDeviceEmbedding from deepray.layers.position_embedding import PositionEmbedding diff --git a/deepray/layers/attention.py b/deepray/layers/attention.py index 9d22095d..9a974003 100644 --- a/deepray/layers/attention.py +++ b/deepray/layers/attention.py @@ -23,15 +23,13 @@ import numpy as np import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers +import tf_keras as keras from deepray.layers import dense_einsum from deepray.layers import masked_softmax -# @tf.keras.utils.register_keras_serializable(package="Text") -class Attention(tf.keras.layers.Layer): +class Attention(keras.layers.Layer): """Attention layer. This is an implementation of multi-headed attention based on "Attention @@ -80,12 +78,12 @@ def __init__( self._num_heads = num_heads self._head_size = head_size self._dropout_rate = dropout_rate - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._bias_initializer = tf.keras.initializers.get(bias_initializer) - self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) - self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) - self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._kernel_initializer = keras.initializers.get(kernel_initializer) + self._bias_initializer = keras.initializers.get(bias_initializer) + self._kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = keras.regularizers.get(bias_regularizer) + self._kernel_constraint = keras.constraints.get(kernel_constraint) + self._bias_constraint = keras.constraints.get(bias_constraint) self._query_dense = dense_einsum.DenseEinsum( output_shape=(self._num_heads, self._head_size), @@ -125,20 +123,20 @@ def __init__( self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1]) - self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + self._dropout = keras.layers.Dropout(rate=self._dropout_rate) def get_config(self): config = { "num_heads": self._num_heads, "head_size": self._head_size, "dropout_rate": self._dropout_rate, - "kernel_initializer": tf.keras.initializers.serialize(self._kernel_initializer), - "bias_initializer": tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": tf.keras.regularizers.serialize(self._bias_regularizer), - "activity_regularizer": 
tf.keras.regularizers.serialize(self._activity_regularizer), - "kernel_constraint": tf.keras.constraints.serialize(self._kernel_constraint), - "bias_constraint": tf.keras.constraints.serialize(self._bias_constraint) + "kernel_initializer": keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": keras.constraints.serialize(self._bias_constraint) } base_config = super(Attention, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -190,7 +188,6 @@ def call(self, inputs): return tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_tensor) -# @tf.keras.utils.register_keras_serializable(package="Text") class CachedAttention(Attention): """Attention layer with cache used for auto-agressive decoding. @@ -266,7 +263,7 @@ def call(self, inputs, decode_loop_step=None): return tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_tensor), cache -class WindowAttention(tf.keras.layers.Layer): +class WindowAttention(keras.layers.Layer): """ ## Window based multi-head self-attention @@ -284,9 +281,9 @@ def __init__(self, dim, window_size, num_heads, qkv_bias=True, dropout_rate=0.0, self.window_size = window_size self.num_heads = num_heads self.scale = (dim // num_heads)**-0.5 - self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) - self.dropout = layers.Dropout(dropout_rate) - self.proj = layers.Dense(dim) + self.qkv = keras.layers.Dense(dim * 3, use_bias=qkv_bias) + self.dropout = keras.layers.Dropout(dropout_rate) + self.proj = keras.layers.Dense(dim) def build(self, input_shape): num_window_elements = (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) diff --git a/deepray/layers/dcn.py b/deepray/layers/dcn.py index 935b2a2d..48658c6a 100644 --- a/deepray/layers/dcn.py +++ b/deepray/layers/dcn.py @@ -16,9 +16,10 @@ from typing import Union, Text, Optional import tensorflow as tf +import tf_keras as keras -class Cross(tf.keras.layers.Layer): +class Cross(keras.layers.Layer): """Cross Layer in Deep & Cross Network to learn explicit feature interactions. 
A layer that creates explicit and bounded-degree feature interactions @@ -43,12 +44,12 @@ class Cross(tf.keras.layers.Layer): ```python # after embedding layer in a functional model: - input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64) + input = keras.Input(shape=(None,), name='index', dtype=tf.int64) x0 = dp.layers.Embedding(vocabulary_size=32, embedding_dim=6) x1 = Cross()(x0, x0) x2 = Cross()(x0, x1) - logits = tf.keras.layers.Dense(units=10)(x2) - model = tf.keras.Model(input, logits) + logits = keras.layers.Dense(units=10)(x2) + model = keras.Model(input, logits) ``` Args: @@ -82,11 +83,11 @@ def __init__( projection_dim: Optional[int] = None, diag_scale: Optional[float] = 0.0, use_bias: bool = True, - preactivation: Optional[Union[str, tf.keras.layers.Activation]] = None, - kernel_initializer: Union[Text, tf.keras.initializers.Initializer] = "truncated_normal", - bias_initializer: Union[Text, tf.keras.initializers.Initializer] = "zeros", - kernel_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None, + preactivation: Optional[Union[str, keras.layers.Activation]] = None, + kernel_initializer: Union[Text, keras.initializers.Initializer] = "truncated_normal", + bias_initializer: Union[Text, keras.initializers.Initializer] = "zeros", + kernel_regularizer: Union[Text, None, keras.regularizers.Regularizer] = None, + bias_regularizer: Union[Text, None, keras.regularizers.Regularizer] = None, **kwargs ): @@ -95,11 +96,11 @@ def __init__( self._projection_dim = projection_dim self._diag_scale = diag_scale self._use_bias = use_bias - self._preactivation = tf.keras.activations.get(preactivation) - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._bias_initializer = tf.keras.initializers.get(bias_initializer) - self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._preactivation = keras.activations.get(preactivation) + self._kernel_initializer = keras.initializers.get(kernel_initializer) + self._bias_initializer = keras.initializers.get(bias_initializer) + self._kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = keras.regularizers.get(bias_regularizer) self._input_dim = None self._supports_masking = True @@ -111,7 +112,7 @@ def build(self, input_shape): last_dim = input_shape[-1] if self._projection_dim is None: - self._dense = tf.keras.layers.Dense( + self._dense = keras.layers.Dense( last_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), bias_initializer=self._bias_initializer, @@ -122,14 +123,14 @@ def build(self, input_shape): activation=self._preactivation, ) else: - self._dense_u = tf.keras.layers.Dense( + self._dense_u = keras.layers.Dense( self._projection_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), kernel_regularizer=self._kernel_regularizer, use_bias=False, dtype=self.dtype, ) - self._dense_v = tf.keras.layers.Dense( + self._dense_v = keras.layers.Dense( last_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), bias_initializer=self._bias_initializer, @@ -183,11 +184,11 @@ def get_config(self): "projection_dim": self._projection_dim, "diag_scale": self._diag_scale, "use_bias": self._use_bias, - "preactivation": tf.keras.activations.serialize(self._preactivation), - "kernel_initializer": tf.keras.initializers.serialize(self._kernel_initializer), 
- "bias_initializer": tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": tf.keras.regularizers.serialize(self._bias_regularizer), + "preactivation": keras.activations.serialize(self._preactivation), + "kernel_initializer": keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self._bias_regularizer), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/deepray/layers/dense.py b/deepray/layers/dense.py new file mode 100644 index 00000000..98e2c3f6 --- /dev/null +++ b/deepray/layers/dense.py @@ -0,0 +1,287 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the Dense layer.""" +# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + +import tensorflow.compat.v2 as tf +from packaging import version + +if version.parse(tf.__version__.replace("-tf", "+tf")) < version.parse("2.11"): + from keras import activations + from keras import backend + from keras import constraints + from keras import initializers + from keras import regularizers + from keras.dtensor import utils + from keras.engine.base_layer import Layer + from keras.engine.input_spec import InputSpec +else: + from keras.src.dtensor import utils + from keras.src import activations + from keras.src import backend + from keras.src import constraints + from keras.src import initializers + from keras.src import regularizers + from keras.src.dtensor import utils + from keras.src.engine.base_layer import Layer + from keras.src.engine.input_spec import InputSpec + + +class Dense(Layer): + """Just your regular densely-connected NN layer. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). These are all attributes of + `Dense`. + + Note: If the input to the layer has a rank greater than 2, then `Dense` + computes the dot product between the `inputs` and the `kernel` along the + last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`). + For example, if input has dimensions `(batch_size, d0, d1)`, + then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates + along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)` + (there are `batch_size * d0` such sub-tensors). + The output in this case will have shape `(batch_size, d0, units)`. 
+
+  Besides, layer attributes cannot be modified after the layer has been called
+  once (except the `trainable` attribute).
+  When a popular kwarg `input_shape` is passed, then keras will create
+  an input layer to insert before the current layer. This can be treated
+  equivalent to explicitly defining an `InputLayer`.
+
+  Example:
+
+  >>> # Create a `Sequential` model and add a Dense layer as the first layer.
+  >>> model = tf.keras.models.Sequential()
+  >>> model.add(tf.keras.Input(shape=(16,)))
+  >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
+  >>> # Now the model will take as input arrays of shape (None, 16)
+  >>> # and output arrays of shape (None, 32).
+  >>> # Note that after the first layer, you don't need to specify
+  >>> # the size of the input anymore:
+  >>> model.add(tf.keras.layers.Dense(32))
+  >>> model.output_shape
+  (None, 32)
+
+  Args:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation").
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    N-D tensor with shape: `(batch_size, ..., input_dim)`.
+    The most common situation would be
+    a 2D input with shape `(batch_size, input_dim)`.
+
+  Output shape:
+    N-D tensor with shape: `(batch_size, ..., units)`.
+    For instance, for a 2D input with shape `(batch_size, input_dim)`,
+    the output would have shape `(batch_size, units)`.
+  """
+
+  @utils.allow_initializer_layout
+  def __init__(
+      self,
+      units,
+      activation=None,
+      use_bias=True,
+      kernel_initializer='glorot_uniform',
+      bias_initializer='zeros',
+      kernel_regularizer=None,
+      bias_regularizer=None,
+      activity_regularizer=None,
+      kernel_constraint=None,
+      bias_constraint=None,
+      **kwargs
+  ):
+    super(Dense, self).__init__(activity_regularizer=activity_regularizer, **kwargs)
+
+    # `name` is consumed by the base Layer constructor above, which exposes it
+    # as the read-only `self.name` property used below.
+    self.units = int(units) if not isinstance(units, int) else units
+    if self.units < 0:
+      raise ValueError(
+          f'Received an invalid value for `units`, expected '
+          f'a positive integer. Received: units={units}'
+      )
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.input_spec = InputSpec(min_ndim=2)
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    dtype = tf.as_dtype(self.dtype or backend.floatx())
+    if not (dtype.is_floating or dtype.is_complex):
+      raise TypeError('A Dense layer can only be built with a floating-point '
+                      f'dtype.
Received: dtype={dtype}') + + input_shape = tf.TensorShape(input_shape) + last_dim = tf.compat.dimension_value(input_shape[-1]) + if last_dim is None: + raise ValueError( + 'The last dimension of the inputs to a Dense layer ' + 'should be defined. Found None. ' + f'Full input shape received: {input_shape}' + ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) + self.kernel = self.add_weight( + '%skernel' % self.name + '_' if self.name else "", + shape=[last_dim, self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + trainable=True + ) + if self.use_bias: + self.bias = self.add_weight( + '%sbias' % self.name + '_' if self.name else "", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True + ) + else: + self.bias = None + self.built = True + + def call(self, inputs): + if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype: + inputs = tf.cast(inputs, dtype=self._compute_dtype_object) + + is_ragged = isinstance(inputs, tf.RaggedTensor) + if is_ragged: + # In case we encounter a RaggedTensor with a fixed last dimension (last + # dimension not ragged), we can flatten the input and restore the ragged + # dimensions at the end. + if tf.compat.dimension_value(inputs.shape[-1]) is None: + raise ValueError( + 'Dense layer only supports RaggedTensors when the ' + 'innermost dimension is non-ragged. Received: ' + f'inputs.shape={inputs.shape}.' + ) + original_inputs = inputs + if inputs.flat_values.shape.rank > 1: + inputs = inputs.flat_values + else: + # Innermost partition is encoded using uniform_row_length. + # (This is unusual, but we can handle it.) + if inputs.shape.rank == 2: + inputs = inputs.to_tensor() + is_ragged = False + else: + for _ in range(original_inputs.ragged_rank - 1): + inputs = inputs.values + inputs = inputs.to_tensor() + original_inputs = tf.RaggedTensor.from_nested_row_splits(inputs, original_inputs.nested_row_splits[:-1]) + + rank = inputs.shape.rank + if rank == 2 or rank is None: + # We use embedding_lookup_sparse as a more efficient matmul operation for + # large sparse input tensors. The op will result in a sparse gradient, as + # opposed to sparse_ops.sparse_tensor_dense_matmul which results in dense + # gradients. This can lead to sigfinicant speedups, see b/171762937. + if isinstance(inputs, tf.SparseTensor): + # We need to fill empty rows, as the op assumes at least one id per row. + inputs, _ = tf.sparse.fill_empty_rows(inputs, 0) + # We need to do some munging of our input to use the embedding lookup as + # a matrix multiply. We split our input matrix into separate ids and + # weights tensors. The values of the ids tensor should be the column + # indices of our input matrix and the values of the weights tensor + # can continue to the actual matrix weights. + # The column arrangement of ids and weights + # will be summed over and does not matter. See the documentation for + # sparse_ops.sparse_tensor_dense_matmul a more detailed explanation + # of the inputs to both ops. + ids = tf.SparseTensor(indices=inputs.indices, values=inputs.indices[:, 1], dense_shape=inputs.dense_shape) + weights = inputs + outputs = tf.nn.embedding_lookup_sparse(self.kernel, ids, weights, combiner='sum') + else: + outputs = tf.matmul(a=inputs, b=self.kernel) + # Broadcast kernel to inputs. 
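+    # For rank > 2 inputs, tf.tensordot below contracts the last input axis
+    # against axis 0 of the kernel.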
+ else: + outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]]) + # Reshape the output back to the original ndim of the input. + if not tf.executing_eagerly(): + shape = inputs.shape.as_list() + output_shape = shape[:-1] + [self.kernel.shape[-1]] + outputs.set_shape(output_shape) + + if self.use_bias: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + outputs = self.activation(outputs) + + if is_ragged: + outputs = original_inputs.with_flat_values(outputs) + + return outputs + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_shape = input_shape.with_rank_at_least(2) + if tf.compat.dimension_value(input_shape[-1]) is None: + raise ValueError( + 'The last dimension of the input shape of a Dense layer ' + 'should be defined. Found None. ' + f'Received: input_shape={input_shape}' + ) + return input_shape[:-1].concatenate(self.units) + + def get_config(self): + config = super(Dense, self).get_config() + config.update( + { + 'name': self.name, + 'units': self.units, + 'activation': activations.serialize(self.activation), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint) + } + ) + return config diff --git a/deepray/layers/dense_einsum.py b/deepray/layers/dense_einsum.py index 4fdbc24a..6abfbd37 100644 --- a/deepray/layers/dense_einsum.py +++ b/deepray/layers/dense_einsum.py @@ -24,7 +24,6 @@ _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"] -# @tf.keras.utils.register_keras_serializable(package="Text") class DenseEinsum(tf.keras.layers.Layer): """A densely connected layer that uses tf.einsum as the backing computation. 
diff --git a/deepray/layers/dynamic_embedding.py b/deepray/layers/dynamic_embedding.py index d3048144..742c632c 100644 --- a/deepray/layers/dynamic_embedding.py +++ b/deepray/layers/dynamic_embedding.py @@ -1,132 +1,139 @@ # -*- coding:utf-8 -*- """Dynamic Embedding layer.""" - from collections import defaultdict from typing import Dict, List from typing import Optional, Literal import pandas as pd import tensorflow as tf -import tensorflow_recommenders_addons as tfra -from absl import flags, logging +from absl import flags from tensorflow.python.keras import regularizers, initializers -from tensorflow_recommenders_addons import dynamic_embedding as de -from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import BasicEmbedding as DynamicEmbedding -from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import HvdAllToAllEmbedding from deepray.layers.bucketize import NumericaBucketIdLayer, Hash -from deepray.utils.horovod_utils import get_world_size, get_rank +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_world_size, get_rank, is_main_process + +logger = logging_util.get_logger() + +try: + import tensorflow_recommenders_addons as tfra + from tensorflow_recommenders_addons import dynamic_embedding as de + from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import BasicEmbedding as DynamicEmbedding + from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import HvdAllToAllEmbedding + + class EmbeddingLayerRedis(DynamicEmbedding): + + def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): + self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) + self.mask_value = mask_value + super().__init__(**kwargs) + + def call(self, ids): + with tf.name_scope(self.name + "/EmbeddingLookupUnique"): + ids_flat = tf.reshape(ids, [-1]) + with tf.device("/CPU:0"): + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + return embeddings + + def get_config(self): + config = { + 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), + 'mask_value': self.mask_value + } + base_config = super(EmbeddingLayerRedis, self).get_config() + + return dict(list(base_config.items()) + list(config.items())) + + class EmbeddingLayerGPU(DynamicEmbedding): + + def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): + self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) + self.mask_value = mask_value + self.with_unique = kwargs.get("with_unique", True) + super().__init__(**kwargs) + + def call(self, ids): + with tf.name_scope(self.name + "/EmbeddingLookupUnique"): + if self.with_unique: + ids_flat = tf.reshape(ids, [-1]) + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + else: + embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, ids) + return embeddings -FLAGS = flags.FLAGS + def get_config(self): + 
config = { + 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), + 'mask_value': self.mask_value + } + base_config = super(EmbeddingLayerGPU, self).get_config() + return dict(list(base_config.items()) + list(config.items())) +except ImportError as e: + logger.warning("An exception occurred when import tensorflow_recommenders_addons: " + str(e)) -class DynamicEmbeddingOption(object): - def __init__( - self, - device: Optional[Literal["HBM", "DRAM", "Redis", "HKV"]] = None, - init_capacity=1 * 1024 * 1024, - max_capacity=128 * 1024 * 1024, - max_hbm_for_vectors=4 * 1024 * 1024 * 1024 - ): - self.device_name = device - self.init_capacity = init_capacity - self.max_capacity = max_capacity - self.max_hbm_for_vectors = max_hbm_for_vectors +class DistributedDynamicEmbedding(tf.keras.layers.Layer): - if device == "Redis": - if FLAGS.redis_config_env: - redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir_env=FLAGS.redis_config_env) + def get_de_options(self, case, init_capacity, **kwargs): + redis_creator = None + cuckoo_creator = None + hkv_creator = None + + if case == "Redis": + if flags.FLAGS.redis_config_env: + redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir_env=flags.FLAGS.redis_config_env) else: - redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir=FLAGS.redis_config_dir) + redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir=flags.FLAGS.redis_config_dir) + redis_creator = tfra.dynamic_embedding.RedisTableCreator(redis_config) - self.devices = ['/CPU:0'] - self.kv_creator = tfra.dynamic_embedding.RedisTableCreator(redis_config) - return - elif device == "HKV": - self.devices = ['/GPU:0'] + if case == "HKV": hkv_config = tfra.dynamic_embedding.HkvHashTableConfig( init_capacity=init_capacity, - max_capacity=max_capacity, - max_hbm_for_vectors=max_hbm_for_vectors, + max_capacity=kwargs.get("max_capacity", 128 * 1024 * 1024), + max_hbm_for_values=kwargs.get("max_hbm_for_values", 4 * 1024 * 1024 * 1024), ) - if FLAGS.use_horovod: - self.kv_creator = tfra.dynamic_embedding.HkvHashTableCreator( + if flags.FLAGS.use_horovod: + hkv_creator = tfra.dynamic_embedding.HkvHashTableCreator( hkv_config, saver=de.FileSystemSaver(proc_size=get_world_size(), proc_rank=get_rank()) ) else: - self.kv_creator = tfra.dynamic_embedding.HkvHashTableCreator(hkv_config) - return - elif device == "HBM": - self.devices = ['/GPU:0'] - elif device == "DRAM": - self.devices = ['/CPU:0'] - else: - raise ValueError(f"Found device {device} not in supported type Redis, DRAM, HBM, HKV") - if FLAGS.use_horovod: - self.kv_creator = de.CuckooHashTableCreator( + hkv_creator = tfra.dynamic_embedding.HkvHashTableCreator(hkv_config, saver=de.FileSystemSaver()) + + if flags.FLAGS.use_horovod: + cuckoo_creator = de.CuckooHashTableCreator( saver=de.FileSystemSaver(proc_size=get_world_size(), proc_rank=get_rank()) ) else: - self.kv_creator = de.CuckooHashTableCreator(saver=de.FileSystemSaver()) - - -class EmbeddingLayerRedis(DynamicEmbedding): - - def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): - self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) - self.mask_value = mask_value - super().__init__(**kwargs) - - def call(self, ids): - with tf.name_scope(self.name + "/EmbeddingLookupUnique"): - ids_flat = tf.reshape(ids, [-1]) - with tf.device("/CPU:0"): - unique_ids, idx = tf.unique(ids_flat) - unique_embeddings = 
tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) - embeddings_flat = tf.gather(unique_embeddings, idx) - embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) - embeddings = tf.reshape(embeddings_flat, embeddings_shape) - return embeddings - - def get_config(self): - config = { - 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), - 'mask_value': self.mask_value - } - base_config = super(EmbeddingLayerRedis, self).get_config() - - return dict(list(base_config.items()) + list(config.items())) - - -class EmbeddingLayerGPU(DynamicEmbedding): - - def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): - self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) - self.mask_value = mask_value - super().__init__(**kwargs) - - def call(self, ids): - with tf.name_scope(self.name + "/EmbeddingLookupUnique"): - ids_flat = tf.reshape(ids, [-1]) - unique_ids, idx = tf.unique(ids_flat) - unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) - embeddings_flat = tf.gather(unique_embeddings, idx) - embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) - embeddings = tf.reshape(embeddings_flat, embeddings_shape) - return embeddings - - def get_config(self): - config = { - 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), - 'mask_value': self.mask_value + cuckoo_creator = de.CuckooHashTableCreator(saver=de.FileSystemSaver()) + + switcher = { + "Redis": { + "devices": ['/CPU:0'], + "kv_creator": redis_creator, + }, + "DRAM": { + "devices": ['/CPU:0'], + "kv_creator": cuckoo_creator, + }, + "HBM": { + "devices": ['/GPU:0'], + "kv_creator": cuckoo_creator, + }, + "HKV": { + "devices": ['/GPU:0'], + "kv_creator": hkv_creator, + }, } - base_config = super(EmbeddingLayerGPU, self).get_config() - - return dict(list(base_config.items()) + list(config.items())) - - -class DistributedDynamicEmbedding(tf.keras.layers.Layer): + return switcher.get(case, None) def __init__( self, @@ -135,7 +142,8 @@ def __init__( value_dtype: str, initializer=None, name: str = '', - de_option: DynamicEmbeddingOption = DynamicEmbeddingOption(device="DRAM"), + device: Optional[Literal["HBM", "DRAM", "Redis", "HKV", "EV"]] = "DRAM", + init_capacity=1 * 1024 * 1024, **kwargs ): super(DistributedDynamicEmbedding, self).__init__() @@ -143,35 +151,40 @@ def __init__( self.key_dtype = key_dtype self.value_dtype = value_dtype self.initializer = initializer - self.de_option = de_option + self.device = device + self.init_capacity = init_capacity - if de_option.device_name == "Redis": + if device == "Redis": + de_option = self.get_de_options(device, init_capacity, **kwargs) self.emb = EmbeddingLayerRedis( embedding_size=embedding_dim, key_dtype=key_dtype, value_dtype=value_dtype, initializer=initializer, name=name, - devices=de_option.devices, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create EmbeddingLayer for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create EmbeddingLayer for {name} on {device} with {embedding_dim} dim") return - if not FLAGS.use_horovod: + de_option = self.get_de_options(device, init_capacity, **kwargs) + if not flags.FLAGS.use_horovod: self.emb = EmbeddingLayerGPU( embedding_size=embedding_dim, key_dtype=key_dtype, value_dtype=value_dtype, 
initializer=initializer, name=name, - devices=de_option.devices, - init_capacity=de_option.init_capacity, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + init_capacity=init_capacity, + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create EmbeddingLayer for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create EmbeddingLayer for {name} on {device} with {embedding_dim} dim") else: self.emb = HvdAllToAllEmbedding( embedding_size=embedding_dim, @@ -179,12 +192,13 @@ def __init__( value_dtype=value_dtype, initializer=initializer, name=name, - devices=de_option.devices, - init_capacity=de_option.init_capacity, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + init_capacity=init_capacity, + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create HvdAllToAllEmbedding for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create HvdAllToAllEmbedding for {name} on {device} with {embedding_dim} dim") def call(self, ids, *args, **kwargs): return self.emb(ids) @@ -197,8 +211,8 @@ def get_config(self): "key_dtype": self.key_dtype, "value_dtype": self.value_dtype, "initializer": self.initializer, - "name": self.name, - "de_option": self.de_option, + "device": self.device, + "init_capacity": self.init_capacity } ) return config @@ -279,13 +293,13 @@ def factor2decimal(self, composition_part: int): return res def build(self, input_shape=None): - self.composition_emb = DistributedDynamicEmbedding( + self.composition_emb = EmbeddingVariable( embedding_dim=self.embedding_dim, key_dtype=self.key_dtype, value_dtype=self.value_dtype, initializer=self.initializer, name=f"embeddings_{self.suffix}/Compositional", - de_option=DynamicEmbeddingOption(device=self.device,) + device=self.device, ) def call(self, inputs, *args, **kwargs): @@ -384,13 +398,13 @@ def build(self, input_shape): name=self.fold_columns[name] ) else: - self.embedding_layers[self.fold_columns[name]] = DistributedDynamicEmbedding( + self.embedding_layers[self.fold_columns[name]] = EmbeddingVariable( embedding_dim=dim, key_dtype=tf.int32 if self.is_valid_value(bucket_boundaries) else dtype, value_dtype=tf.float32, initializer=tf.keras.initializers.GlorotUniform(), name='embedding_' + self.fold_columns[name], - de_option=DynamicEmbeddingOption(device=storage_type) + device=storage_type, ) self.split_dims[self.fold_columns[name]].append(length) diff --git a/deepray/layers/embedding.py b/deepray/layers/embedding.py index 71b91886..557b8314 100644 --- a/deepray/layers/embedding.py +++ b/deepray/layers/embedding.py @@ -20,24 +20,40 @@ import numpy as np import pandas as pd import tensorflow as tf -from absl import flags -from keras import backend -from keras import constraints -from keras import initializers -from keras import regularizers -from keras.dtensor import utils -from keras.engine import base_layer_utils -from keras.engine.base_layer import Layer -from tensorflow import keras -from tensorflow.keras import layers -from tensorflow.keras.layers import StringLookup -from tensorflow.python.keras.utils import tf_utils - +from packaging.version import parse + +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras import backend + from keras import constraints + from keras import initializers + from keras import regularizers + from keras.dtensor import utils + from keras.engine import base_layer_utils + from keras.engine.base_layer import Layer + from 
keras.utils import tf_utils +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src import backend + from tf_keras.src import constraints + from tf_keras.src import initializers + from tf_keras.src import regularizers + from tf_keras.src.dtensor import utils + from tf_keras.src.engine import base_layer_utils + from tf_keras.src.engine.base_layer import Layer + from tf_keras.src.utils import tf_utils +else: + from keras.src import backend + from keras.src import constraints + from keras.src import initializers + from keras.src import regularizers + from keras.src.dtensor import utils + from keras.src.engine import base_layer_utils + from keras.src.engine.base_layer import Layer + from keras.src.utils import tf_utils + +import tf_keras as keras import deepray as dp from deepray.layers.bucketize import Hash -FLAGS = flags.FLAGS - def get_variable_path(checkpoint_path, name, i=0): tokens = name.split('/') @@ -55,7 +71,7 @@ class Embedding(Layer): e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]` This layer can only be used on positive integer inputs of a fixed range. The - `tf.keras.layers.TextVectorization`, `tf.keras.layers.StringLookup`, + `tf.keras.layers.TextVectorization`, `keras.layers.StringLookup`, and `tf.keras.layers.IntegerLookup` preprocessing layers can help prepare inputs for an `Embedding` layer. @@ -576,12 +592,12 @@ def __init__(self, vocabulary, embedding_dim, num_buckets, name=None): super().__init__(name=name) self.num_buckets = num_buckets - self.index_lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0) - self.q_embeddings = layers.Embedding( + self.index_lookup = keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0) + self.q_embeddings = keras.layers.Embedding( num_buckets, embedding_dim, ) - self.r_embeddings = layers.Embedding( + self.r_embeddings = keras.layers.Embedding( num_buckets, embedding_dim, ) @@ -649,17 +665,17 @@ def __init__(self, blocks_vocabulary, blocks_embedding_dims, base_embedding_dim, block_embedding_encoder = self.embedding_encoder(vocabulary, embedding_dim, num_oov_indices=1) self.block_embedding_encoders.append(block_embedding_encoder) if embedding_dim == base_embedding_dim: - self.block_embedding_projectors.append(layers.Lambda(lambda x: x)) + self.block_embedding_projectors.append(keras.layers.Lambda(lambda x: x)) else: - self.block_embedding_projectors.append(layers.Dense(units=base_embedding_dim)) + self.block_embedding_projectors.append(keras.layers.Dense(units=base_embedding_dim)) self.base_embedding_dim = 64 def embedding_encoder(self, vocabulary, embedding_dim, num_oov_indices=0, name=None): return keras.Sequential( [ - StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices), - layers.Embedding(input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim), + keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices), + keras.layers.Embedding(input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim), ], name=f"{name}_embedding" if name else None, ) diff --git a/deepray/layers/embedding_variable.py b/deepray/layers/embedding_variable.py new file mode 100644 index 00000000..ae9aef32 --- /dev/null +++ b/deepray/layers/embedding_variable.py @@ -0,0 +1,206 @@ +# -*- coding:utf-8 -*- +"""Dynamic Embedding layer.""" +import typing + +import horovod.tensorflow as hvd +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from 
tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_world_size + +logger = logging_util.get_logger() + +StorageType = { + "HBM": config_pb2.StorageType.HBM, + "DRAM": config_pb2.StorageType.DRAM, + "HBM_DRAM": config_pb2.StorageType.HBM_DRAM, + "LEVELDB": config_pb2.StorageType.LEVELDB, + "SSDHASH": config_pb2.StorageType.SSDHASH, + "DRAM_LEVELDB": config_pb2.StorageType.DRAM_LEVELDB, + "DRAM_SSDHASH": config_pb2.StorageType.DRAM_SSDHASH +} + +CacheStrategy = {"LFU": config_pb2.CacheStrategy.LFU, "LRU": config_pb2.CacheStrategy.LRU} + + +def default_partition_fn(keys, shard_num): + """The default partition function. + partition keys by "mod" strategy. + + keys: a tensor presents the keys to be partitioned. + shard_num: the num of partitions + Returns: + a tensor with same shape as keys with type of `tf.int32`, + represents the corresponding partition-ids of keys. + """ + return math_ops.mod(keys, shard_num) + + +def int64_partition_fn(keys, shard_num): + return math_ops.cast(math_ops.mod(keys, shard_num), dtype=dtypes.int32) + + +def partition_fn_v2(keys, shard_num): + return tf.cast( + tf.strings.to_hash_bucket_fast( + tf.strings.as_string(keys), # 将 int 转为 string 再哈希 + num_buckets=shard_num + ), + tf.int32 + ) + + +class EmbeddingVariable(tf.keras.layers.Layer): + + def __init__( + self, + embedding_dim: int, + key_dtype=dtypes.int64, + value_dtype: str = None, + initializer=None, + name: str = '', + with_unique=False, + partition_fn: typing.Callable[[typing.Any, typing.Any], typing.Any] = None, + **kwargs + ): + super(EmbeddingVariable, self).__init__(name=name) + self.embedding_size = embedding_dim + self.with_unique = with_unique + self.world_size = get_world_size() + + if partition_fn is None: + if key_dtype == dtypes.int64: + partition_fn = int64_partition_fn + elif key_dtype == dtypes.int32: + partition_fn = default_partition_fn + + storage_type = kwargs.get("storage_type", None) + if storage_type: + ev_option = ev_variables.EmbeddingVariableOption( + storage_option=ev_variables.StorageOption( + storage_type=StorageType[storage_type], + storage_path=kwargs.get("storage_path", None), + storage_size=kwargs.get("storage_size", [1024 * 1024 * 1024]), + cache_strategy=CacheStrategy[kwargs.get("cache_strategy", "LFU")] + ) + ) + else: + ev_option = ev_variables.EmbeddingVariableOption() + + self.embedding_variable = get_embedding_variable( + embedding_dim=embedding_dim, + key_dtype=key_dtype, + value_dtype=value_dtype, + initializer=initializer, + name=name, + ev_option=ev_option, + ) + + self.partition_fn = partition_fn + if self.world_size > 1: + self.call = self.hvd_read + if self.world_size >= 8: # 小规模并行时用取模更快 + self.partition_fn = partition_fn_v2 + else: + self.call = self.unique_read if self.with_unique else self.read + + def make_partition(self, data, partition_index): + """ + Shard keys to shard_num partitions + + Args: + data: keys or values, usually the IDs of dynamic features. + partition_index: partitions index. 
+ shard_num: partition number + Returns: + a pair of tensor: (partition result, partition indices) + """ + partitions = tf.dynamic_partition(data, partition_index, self.world_size) + indices = tf.dynamic_partition(math_ops.range(array_ops.shape(data)[0]), partition_index, self.world_size) + return partitions, indices + + def read(self, ids, *args, **kwargs): + return self.embedding_variable.sparse_read(ids) + + def unique_read(self, ids, *args, **kwargs): + """Read with deduplication for better performance with repeated IDs.""" + with ops.name_scope(f"{self.name}/EmbeddingWithUnique"): + ids_flat = tf.reshape(ids, [-1]) + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = self.embedding_variable.sparse_read(unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + return embeddings + + def hvd_read(self, ids, *args, **kwargs): + """ + Compute embedding output for feature ids. The output shape will be (shape(ids), + embedding_size). + + Args: + ids: feature ids of the input. It should be same dtype as the key_dtype + of the layer. + + Returns: + A embedding output with shape (shape(ids), embedding_size). + """ + is_ragged = isinstance(ids, tf.RaggedTensor) + + if is_ragged: + original_structure = ids + ids = ids.flat_values + + input_shape = tf.shape(ids) + embeddings_shape = tf.concat([input_shape, [self.embedding_size]], 0) + + ids_flat = tf.reshape(ids, [-1]) + + def distributed_lookup(ids): + partition_index = self.partition_fn(ids, self.world_size) + ids_partitions, gather_indices = self.make_partition(ids, partition_index) + partitions_sizes = tf.stack([tf.size(p) for p in ids_partitions], axis=0) + relocs_tensor = tf.concat(ids_partitions, axis=0) + # Provide a unique name for the first alltoall operation + flat_reloc_ids, remote_sizes = hvd.alltoall( + relocs_tensor, splits=partitions_sizes, name=f"{self.name}_alltoall_ids" + ) + + lookup_result = self.read(flat_reloc_ids) + lookup_result, _ = hvd.alltoall(lookup_result, splits=remote_sizes, name=f"{self.name}_alltoall_embeddings") + + input_shape = tf.shape(ids) + recover_shape = tf.concat((input_shape, (self.embedding_size,)), axis=0) + gather_indices = tf.expand_dims(tf.concat(gather_indices, axis=0), axis=-1) + lookup_result = tf.scatter_nd(gather_indices, lookup_result, recover_shape) + return lookup_result + + if self.with_unique: + # with ops.name_scope(name, "EmbeddingWithUnique"): + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = distributed_lookup(unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + else: + embeddings_flat = distributed_lookup(ids_flat) + + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + + if is_ragged: + embeddings = tf.RaggedTensor.from_row_lengths(embeddings, original_structure.row_lengths()) + + return embeddings + + def get_config(self): + config = super().get_config() + config.update({ + "world_size": self.world_size, + "name": self.name, + }) + return config diff --git a/deepray/layers/feature_cross.py b/deepray/layers/feature_cross.py index a2b24694..40ad397c 100644 --- a/deepray/layers/feature_cross.py +++ b/deepray/layers/feature_cross.py @@ -351,8 +351,8 @@ def build(self, input_shape): kernel_regularizer=self.regularizer, name="compress_tower" ) - self._trainable_weights.extend(self.compress_tower.trainable_weights) - 
self._non_trainable_weights.extend(self.compress_tower.non_trainable_weights) + self.trainable_weights.extend(self.compress_tower.trainable_weights) + self.non_trainable_weights.extend(self.compress_tower.non_trainable_weights) return super(CDot, self).build(input_shape) def call(self, inputs, **kwargs): @@ -516,7 +516,6 @@ def __init__( allow_kernel_norm: bool = False, use_dropout=False, keep_prob=0.95, - mode: str = tf.estimator.ModeKeys.TRAIN, **kwargs ): super(DCN, self).__init__(**kwargs) @@ -529,7 +528,6 @@ def __init__( self.allow_kernel_norm = allow_kernel_norm self.use_dropout = use_dropout self.keep_prob = keep_prob - self.mode = mode def build(self, input_shape): dims = check_dim(input_shape[-1]) @@ -621,7 +619,7 @@ def build(self, input_shape): return super(DCN, self).build(input_shape) - def call(self, inputs, **kwargs): + def call(self, inputs, training=None, **kwargs): x0 = inputs xl = x0 @@ -660,7 +658,7 @@ def call(self, inputs, **kwargs): moe_out = tf.matmul(output_of_experts, gating_score_of_experts) xl = tf.squeeze(moe_out, -1) + xl - if self.use_dropout and self.mode == tf.estimator.ModeKeys.TRAIN: + if self.use_dropout and training: xl = tf.nn.dropout(xl, rate=1 - self.keep_prob) return xl @@ -683,15 +681,15 @@ def get_variable(self, name, shape, dtype, initializer, regularizer, trainable): for v in var: K.track_variable(v) if trainable: - self._trainable_weights.append(v) + self.trainable_weights.append(v) else: - self._non_trainable_weights.append(v) + self.non_trainable_weights.append(v) else: K.track_variable(var) if trainable: - self._trainable_weights.append(var) + self.trainable_weights.append(var) else: - self._non_trainable_weights.append(var) + self.non_trainable_weights.append(var) with tf.compat.v1.variable_scope('', reuse=tf.compat.v1.AUTO_REUSE): trainable_var_norm = tf.compat.v1.get_variable( @@ -703,15 +701,15 @@ def get_variable(self, name, shape, dtype, initializer, regularizer, trainable): for v in trainable_var_norm: K.track_variable(v) if trainable: - self._trainable_weights.append(v) + self.trainable_weights.append(v) else: - self._non_trainable_weights.append(v) + self.non_trainable_weights.append(v) else: K.track_variable(trainable_var_norm) if trainable: - self._trainable_weights.append(trainable_var_norm) + self.trainable_weights.append(trainable_var_norm) else: - self._non_trainable_weights.append(trainable_var_norm) + self.non_trainable_weights.append(trainable_var_norm) var = tf.multiply(normalized, trainable_var_norm, name='mul_var_norm') else: var = self.add_weight( @@ -731,7 +729,6 @@ def get_config(self): 'allow_kernel_norm': self.allow_kernel_norm, 'use_dropout': self.use_dropout, 'keep_prob': self.keep_prob, - 'mode': self.mode } base_config = super(DCN, self).get_config() @@ -819,8 +816,8 @@ def build(self, input_shape): ) ) - self._trainable_weights.extend(self._conv1d[-1].trainable_weights) - self._non_trainable_weights.extend(self._conv1d[-1].non_trainable_weights) + self.trainable_weights.extend(self._conv1d[-1].trainable_weights) + self.non_trainable_weights.extend(self._conv1d[-1].non_trainable_weights) return super(CIN, self).build(input_shape) def call(self, inputs, **kwargs): diff --git a/deepray/layers/masked_softmax.py b/deepray/layers/masked_softmax.py index 257b2ae9..1925955b 100644 --- a/deepray/layers/masked_softmax.py +++ b/deepray/layers/masked_softmax.py @@ -22,7 +22,6 @@ import tensorflow as tf -@tf.keras.utils.register_keras_serializable(package='Text') class MaskedSoftmax(tf.keras.layers.Layer): """Performs a 
softmax with optional masking on a tensor. diff --git a/deepray/layers/max_unpooling_2d.py b/deepray/layers/max_unpooling_2d.py index 01bda9c1..4e7cbc28 100644 --- a/deepray/layers/max_unpooling_2d.py +++ b/deepray/layers/max_unpooling_2d.py @@ -19,7 +19,46 @@ from typeguard import typechecked from typing import Union, Iterable -from deepray.utils.keras_utils import normalize_tuple + +def normalize_tuple(value, n, name): + """Transforms an integer or iterable of integers into an integer tuple. + + A copy of tensorflow.python.keras.util. + + Args: + value: The value to validate and convert. Could an int, or any iterable + of ints. + n: The size of the tuple to be returned. + name: The name of the argument being validated, e.g. "strides" or + "kernel_size". This is only used to format error messages. + + Returns: + A tuple of n integers. + + Raises: + ValueError: If something else than an int/long or iterable thereof was + passed. + """ + if isinstance(value, int): + return (value,) * n + else: + try: + value_tuple = tuple(value) + except TypeError: + raise TypeError("The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value)) + if len(value_tuple) != n: + raise ValueError( + "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + ) + for single_value in value_tuple: + try: + int(single_value) + except (ValueError, TypeError): + raise ValueError( + "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + " " + "including element " + str(single_value) + " of type" + " " + str(type(single_value)) + ) + return value_tuple def _calculate_output_shape(input_shape, pool_size, strides, padding): diff --git a/deepray/layers/max_unpooling_2d_v2.py b/deepray/layers/max_unpooling_2d_v2.py index 1dd4607e..6acd769f 100644 --- a/deepray/layers/max_unpooling_2d_v2.py +++ b/deepray/layers/max_unpooling_2d_v2.py @@ -19,7 +19,7 @@ from typeguard import typechecked from typing import Iterable -from deepray.utils.keras_utils import normalize_tuple +from deepray.layers.max_unpooling_2d import normalize_tuple def _max_unpooling_2d_v2(updates, mask, output_size): diff --git a/deepray/layers/mlp.py b/deepray/layers/mlp.py index bf1d59ea..b6caa864 100644 --- a/deepray/layers/mlp.py +++ b/deepray/layers/mlp.py @@ -2,8 +2,7 @@ from typing import List import tensorflow as tf -from tensorflow.keras.layers import BatchNormalization as BatchNorm -from tensorflow.python.keras import regularizers +import tf_keras as keras def extend_as_list(x, n): @@ -66,8 +65,8 @@ def __init__( self.hidden_units = hidden_units self.prefix = name self.use_bias = use_bias - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) + self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self.bias_regularizer = keras.regularizers.get(bias_regularizer) self.enable_batch_normalization = enable_batch_normalization self.batch_normalization_momentum = batch_normalization_momentum self.batch_normalization_renorm = batch_normalization_renorm @@ -95,15 +94,15 @@ def __init__( def build(self, input_shape): if self.enable_batch_normalization: - bn = BatchNorm( + bn = keras.layers.BatchNormalization( momentum=self.batch_normalization_momentum, renorm=self.batch_normalization_renorm, renorm_clipping=self.batch_normalization_renorm_clipping, renorm_momentum=self.batch_normalization_renorm_momentum, name=f"BatchNorm/in" ) - 
self._trainable_weights.extend(bn.trainable_weights) - self._non_trainable_weights.extend(bn.non_trainable_weights) + self.trainable_weights.extend(bn.trainable_weights) + self.non_trainable_weights.extend(bn.non_trainable_weights) self.add_loss(bn.losses) self._stacked_layers.append(bn) @@ -119,21 +118,21 @@ def build(self, input_shape): kernel_regularizer=self.kernel_regularizer, bias_regularizer=self.bias_regularizer ) - self._trainable_weights.extend(dense.trainable_weights) - self._non_trainable_weights.extend(dense.non_trainable_weights) + self.trainable_weights.extend(dense.trainable_weights) + self.non_trainable_weights.extend(dense.non_trainable_weights) self.add_loss(dense.losses) self._stacked_layers.append(dense) if not is_final_layer and self.enable_batch_normalization: - bn = BatchNorm( + bn = keras.layers.BatchNormalization( momentum=self.batch_normalization_momentum, renorm=self.batch_normalization_renorm, renorm_clipping=self.batch_normalization_renorm_clipping, renorm_momentum=self.batch_normalization_renorm_momentum, name=f"BatchNorm/out" ) - self._trainable_weights.extend(bn.trainable_weights) - self._non_trainable_weights.extend(bn.non_trainable_weights) + self.trainable_weights.extend(bn.trainable_weights) + self.non_trainable_weights.extend(bn.non_trainable_weights) self.add_loss(bn.losses) self._stacked_layers.append(bn) @@ -158,8 +157,8 @@ def get_config(self): "enable_batch_normalization": self.enable_batch_normalization, "batch_normalization_momentum": self.batch_normalization_momentum, "use_bias": self.use_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'kernel_regularizer': keras.regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': keras.regularizers.serialize(self.bias_regularizer), 'batch_normalization_renorm': self.batch_normalization_renorm, 'batch_normalization_renorm_clipping': self.batch_normalization_renorm_clipping, 'batch_normalization_renorm_momentum': self.batch_normalization_renorm_momentum diff --git a/deepray/layers/networks/__init__.py b/deepray/layers/networks/__init__.py index 1b10b038..e69de29b 100644 --- a/deepray/layers/networks/__init__.py +++ b/deepray/layers/networks/__init__.py @@ -1,17 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Networks package definition.""" -from deepray.layers.networks.transformer_encoder import TransformerEncoder -from .span_labeling import SpanLabeling \ No newline at end of file diff --git a/deepray/layers/noisy_dense.py b/deepray/layers/noisy_dense.py index 07fa9d39..83caf31e 100644 --- a/deepray/layers/noisy_dense.py +++ b/deepray/layers/noisy_dense.py @@ -14,14 +14,7 @@ # ============================================================================== import tensorflow as tf -from tensorflow.keras import ( - activations, - initializers, - regularizers, - constraints, -) -from tensorflow.keras import backend as K -from tensorflow.keras.layers import InputSpec +import tf_keras as keras from typeguard import typechecked from deepray.utils import types @@ -137,7 +130,7 @@ def __init__( def build(self, input_shape): # Make sure dtype is correct - dtype = tf.dtypes.as_dtype(self.dtype or K.floatx()) + dtype = tf.dtypes.as_dtype(self.dtype or keras.floatx()) if not (dtype.is_floating or dtype.is_complex): raise TypeError("Unable to build `Dense` layer with non-floating point " "dtype %s" % (dtype,)) @@ -148,7 +141,7 @@ def build(self, input_shape): if self.last_dim is None: raise ValueError("The last dimension of the inputs to `Dense` " "should be defined. Found `None`.") - self.input_spec = InputSpec(min_ndim=2, axes={-1: self.last_dim}) + self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: self.last_dim}) # use factorising Gaussian variables if self.use_factorised: @@ -159,8 +152,8 @@ def build(self, input_shape): mu_init = (3.0 / self.last_dim)**(1 / 2) sigma_init = 0.017 - sigma_init = initializers.Constant(value=sigma_init) - mu_init = initializers.RandomUniform(minval=-mu_init, maxval=mu_init) + sigma_init = keras.initializers.Constant(value=sigma_init) + mu_init = keras.initializers.RandomUniform(minval=-mu_init, maxval=mu_init) # Learnable parameters self.sigma_kernel = self.add_weight( @@ -186,7 +179,7 @@ def build(self, input_shape): self.eps_kernel = self.add_weight( "eps_kernel", shape=[self.last_dim, self.units], - initializer=initializers.Zeros(), + initializer=keras.initializers.Zeros(), regularizer=None, constraint=None, dtype=self.dtype, @@ -223,7 +216,7 @@ def build(self, input_shape): shape=[ self.units, ], - initializer=initializers.Zeros(), + initializer=keras.initializers.Zeros(), regularizer=None, constraint=None, dtype=self.dtype, @@ -284,13 +277,13 @@ def get_config(self): "units": self.units, "sigma": self.sigma, "use_factorised": self.use_factorised, - "activation": activations.serialize(self.activation), + "activation": keras.activations.serialize(self.activation), "use_bias": self.use_bias, - "kernel_regularizer": regularizers.serialize(self.kernel_regularizer), - "bias_regularizer": regularizers.serialize(self.bias_regularizer), - "activity_regularizer": regularizers.serialize(self.activity_regularizer), - "kernel_constraint": constraints.serialize(self.kernel_constraint), - "bias_constraint": constraints.serialize(self.bias_constraint), + "kernel_regularizer": keras.regularizers.serialize(self.kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self.bias_regularizer), + "activity_regularizer": keras.regularizers.serialize(self.activity_regularizer), + "kernel_constraint": keras.constraints.serialize(self.kernel_constraint), + "bias_constraint": keras.constraints.serialize(self.bias_constraint), } ) return config diff --git 
a/deepray/layers/on_device_embedding.py b/deepray/layers/on_device_embedding.py index 585c9fca..7cf4c4a8 100644 --- a/deepray/layers/on_device_embedding.py +++ b/deepray/layers/on_device_embedding.py @@ -24,7 +24,6 @@ from deepray.layers import tf_utils -# @tf.keras.utils.register_keras_serializable(package="Text") class OnDeviceEmbedding(tf.keras.layers.Layer): """Performs an embedding lookup suitable for accelerator devices. diff --git a/deepray/layers/pooling.py b/deepray/layers/pooling.py index 40c2c311..a3780066 100644 --- a/deepray/layers/pooling.py +++ b/deepray/layers/pooling.py @@ -1,9 +1,7 @@ import tensorflow as tf -from keras.engine.base_layer import Layer - -class Pooling(Layer): +class Pooling(tf.keras.layers.Layer): """ input shape: (batch_size, seq_len, emb_dim) output shape: (batch_size, 1, emb_dim) diff --git a/deepray/layers/rnn/esn_cell.py b/deepray/layers/rnn/esn_cell.py index 28db723d..6440c94c 100644 --- a/deepray/layers/rnn/esn_cell.py +++ b/deepray/layers/rnn/esn_cell.py @@ -15,7 +15,11 @@ """Implements ESN Cell.""" import tensorflow as tf -import tensorflow.keras as keras +from packaging.version import parse +if parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.layers.rnn.abstract_rnn_cell import AbstractRNNCell +else: + from tensorflow.keras.layers.AbstractRNNCell import AbstractRNNCell from typeguard import typechecked from deepray.utils.types import ( @@ -25,7 +29,7 @@ @tf.keras.utils.register_keras_serializable(package="Deepray") -class ESNCell(keras.layers.AbstractRNNCell): +class ESNCell(AbstractRNNCell): """Echo State recurrent Network (ESN) cell. This implements the recurrent cell from the paper: H. Jaeger diff --git a/deepray/layers/rnn/layer_norm_lstm_cell.py b/deepray/layers/rnn/layer_norm_lstm_cell.py index 589e889c..1600fecd 100644 --- a/deepray/layers/rnn/layer_norm_lstm_cell.py +++ b/deepray/layers/rnn/layer_norm_lstm_cell.py @@ -15,7 +15,7 @@ """Implements LayerNormLSTM Cell.""" import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from typeguard import typechecked from deepray.utils.types import ( diff --git a/deepray/layers/rnn/layer_norm_simple_rnn_cell.py b/deepray/layers/rnn/layer_norm_simple_rnn_cell.py index 7d4f0999..537a1107 100644 --- a/deepray/layers/rnn/layer_norm_simple_rnn_cell.py +++ b/deepray/layers/rnn/layer_norm_simple_rnn_cell.py @@ -15,7 +15,7 @@ """Implements LayerNormSimpleRNNCell Cell.""" import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from typeguard import typechecked from deepray.utils.types import ( diff --git a/deepray/layers/rnn/nas_cell.py b/deepray/layers/rnn/nas_cell.py index 7bf0c6f6..62e04f7e 100644 --- a/deepray/layers/rnn/nas_cell.py +++ b/deepray/layers/rnn/nas_cell.py @@ -15,7 +15,11 @@ """Implements NAS Cell.""" import tensorflow as tf -import tensorflow.keras as keras +from packaging.version import parse +if parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.layers.rnn.abstract_rnn_cell import AbstractRNNCell +else: + from tensorflow.keras.layers.AbstractRNNCell import AbstractRNNCell from typeguard import typechecked from deepray.utils.types import ( @@ -27,7 +31,7 @@ @tf.keras.utils.register_keras_serializable(package="Deepray") -class NASCell(keras.layers.AbstractRNNCell): +class NASCell(AbstractRNNCell): """Neural Architecture Search (NAS) recurrent network cell. 
This implements the recurrent cell from the paper: diff --git a/deepray/layers/rnn/tests/esn_cell_test.py b/deepray/layers/rnn/tests/esn_cell_test.py index 3f5840e6..f6924270 100644 --- a/deepray/layers/rnn/tests/esn_cell_test.py +++ b/deepray/layers/rnn/tests/esn_cell_test.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import ESNCell diff --git a/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py b/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py index 43937ec7..49f26bd9 100644 --- a/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py +++ b/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py @@ -19,7 +19,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import LayerNormLSTMCell diff --git a/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py b/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py index b2a5045b..baaefefc 100644 --- a/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py +++ b/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py @@ -15,7 +15,7 @@ """Tests for LayerNormSimpleRNN Cell.""" import numpy as np -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import LayerNormSimpleRNNCell diff --git a/deepray/layers/rnn/tests/nas_cell_test.py b/deepray/layers/rnn/tests/nas_cell_test.py index 24dc7465..63079d0c 100644 --- a/deepray/layers/rnn/tests/nas_cell_test.py +++ b/deepray/layers/rnn/tests/nas_cell_test.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import NASCell diff --git a/deepray/layers/self_attention_mask.py b/deepray/layers/self_attention_mask.py index ba0e5a92..fe0940f2 100644 --- a/deepray/layers/self_attention_mask.py +++ b/deepray/layers/self_attention_mask.py @@ -23,7 +23,6 @@ from deepray.layers import tf_utils -@tf.keras.utils.register_keras_serializable(package='Text') class SelfAttentionMask(tf.keras.layers.Layer): """Create 3D attention mask from a 2D tensor mask. diff --git a/deepray/layers/tests_bak/on_device_embedding_test.py b/deepray/layers/tests_bak/on_device_embedding_test.py new file mode 100644 index 00000000..7ab347a4 --- /dev/null +++ b/deepray/layers/tests_bak/on_device_embedding_test.py @@ -0,0 +1,183 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras-based one-hot embedding layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from deepray.layers import on_device_embedding + + +# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It +# guarantees forward compatibility of this code for the V2 switchover. +@keras_parameterized.run_all_keras_modes +class OnDeviceEmbeddingTest(keras_parameterized.TestCase): + + def test_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding(vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_layer_creation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16" + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding(vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_layer_invocation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16" + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. 
We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + def test_one_hot_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_one_hot_layer_creation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16", use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_one_hot_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_one_hot_layer_invocation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16", use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. 
+ batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + +if __name__ == "__main__": + tf.test.main() diff --git a/deepray/layers/tf_utils.py b/deepray/layers/tf_utils.py index 2c5370f9..42301f6b 100644 --- a/deepray/layers/tf_utils.py +++ b/deepray/layers/tf_utils.py @@ -22,6 +22,7 @@ import tensorflow as tf from deepray import activations +from deepray.activations import swish def pack_inputs(inputs): @@ -92,9 +93,9 @@ def get_activation(identifier): if isinstance(identifier, six.string_types): name_to_fn = { "gelu": tf.keras.activations.gelu, - "simple_swish": activations.simple_swish, - "hard_swish": activations.hard_swish, - "identity": activations.identity, + "simple_swish": swish.simple_swish, + "hard_swish": swish.hard_swish, + "identity": swish.identity, } identifier = str(identifier).lower() if identifier in name_to_fn: diff --git a/deepray/layers/transformer.py b/deepray/layers/transformer.py index 08d1e28f..b00085a6 100644 --- a/deepray/layers/transformer.py +++ b/deepray/layers/transformer.py @@ -25,7 +25,6 @@ from deepray.layers import dense_einsum -# @tf.keras.utils.register_keras_serializable(package="Text") class Transformer(tf.keras.layers.Layer): """Transformer layer. @@ -129,7 +128,9 @@ def build(self, input_shape): ) self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) self._attention_layer_norm = ( - tf.keras.layers.LayerNormalization(name="self_attention_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32) + tf.keras.layers.LayerNormalization( + name=f"{self.name}/self_attention_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 + ) ) self._intermediate_dense = dense_einsum.DenseEinsum( output_shape=self._intermediate_size, @@ -157,7 +158,7 @@ def build(self, input_shape): ) self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) self._output_layer_norm = tf.keras.layers.LayerNormalization( - name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 + name=f"{self.name}/output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 ) super(Transformer, self).build(input_shape) @@ -196,12 +197,12 @@ def call(self, inputs): attention_output = self._attention_dropout(attention_output) # Use float32 in keras layer norm and the gelu activation in the # intermediate dense layer for numeric stability - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: input_tensor = tf.cast(input_tensor, tf.float32) attention_output = tf.cast(attention_output, tf.float32) attention_output = self._attention_layer_norm(input_tensor + attention_output) intermediate_output = self._intermediate_dense(attention_output) - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: # Casts to float32 so that activation is done in float32. 
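# Editorial note on the mixed-precision path (descriptive, not part of the patch):
# when the layer's compute dtype is float16 or bfloat16, the residual adds, the
# layer norms, and the gelu activation in the intermediate dense layer are
# evaluated in float32 for numeric stability, and the block output is cast back
# to half precision at the very end. Note that the final cast below targets
# tf.float16 unconditionally, so a bfloat16 layer would emit float16 activations
# unless that cast is adjusted to self.dtype.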
intermediate_output = tf.cast(intermediate_output, tf.float32) intermediate_output = self._intermediate_activation_layer(intermediate_output) @@ -211,10 +212,10 @@ def call(self, inputs): layer_output = self._output_dense(intermediate_output) layer_output = self._output_dropout(layer_output) # Use float32 in keras layer norm for numeric stability - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: layer_output = tf.cast(layer_output, tf.float32) layer_output = self._output_layer_norm(layer_output + attention_output) - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: layer_output = tf.cast(layer_output, tf.float16) return layer_output diff --git a/deepray/layers/transformer_scaffold.py b/deepray/layers/transformer_scaffold.py index e8d865ed..488f8a59 100644 --- a/deepray/layers/transformer_scaffold.py +++ b/deepray/layers/transformer_scaffold.py @@ -25,7 +25,6 @@ from deepray.layers import dense_einsum -# @tf.keras.utils.register_keras_serializable(package="Text") class TransformerScaffold(tf.keras.layers.Layer): """Transformer scaffold layer. diff --git a/deepray/losses/__init__.py b/deepray/losses/__init__.py index f77298c1..295e6c4b 100644 --- a/deepray/losses/__init__.py +++ b/deepray/losses/__init__.py @@ -15,16 +15,20 @@ """Additional losses that conform to Keras API.""" import abc -import tensorflow as tf from absl import flags -from keras.engine import compile_utils -from tensorflow.keras.losses import BinaryCrossentropy +import tensorflow as tf +from packaging.version import parse + +if parse(tf.__version__) < parse("2.11"): + from keras.engine import compile_utils +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine import compile_utils + import tf_keras as keras +else: + from keras.src.engine import compile_utils +from tensorflow.keras.losses import BinaryCrossentropy from deepray.losses.contrastive import contrastive_loss, ContrastiveLoss -from deepray.losses.focal_loss import ( - sigmoid_focal_crossentropy, - SigmoidFocalCrossEntropy, -) from deepray.losses.giou_loss import giou_loss, GIoULoss from deepray.losses.kappa_loss import WeightedKappaLoss from deepray.losses.lifted import lifted_struct_loss, LiftedStructLoss @@ -42,8 +46,7 @@ TripletSemiHardLoss, TripletHardLoss, ) - -FLAGS = flags.FLAGS +from deepray.losses.softmax_loss import SoftmaxLoss class Loss(compile_utils.LossesContainer): @@ -64,7 +67,7 @@ def __call__(self, y_true, y_pred, sample_weight=None, regularization_losses=Non self._built = True loss_value = self.call(y_true, y_pred, sample_weight) total_loss_mean_value = tf.nn.compute_average_loss( - loss_value, global_batch_size=FLAGS.batch_size * FLAGS.num_accumulation_steps + loss_value, global_batch_size=flags.FLAGS.batch_size * flags.FLAGS.num_accumulation_steps ) self._loss_metric.update_state(total_loss_mean_value, diff --git a/deepray/losses/_loss_util.py b/deepray/losses/_loss_util.py new file mode 100644 index 00000000..5eaa4561 --- /dev/null +++ b/deepray/losses/_loss_util.py @@ -0,0 +1,281 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the losses for TF-Ranking.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math +from typing import Callable, Dict, Tuple +import tensorflow as tf + +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. + +TensorLike = tf.types.experimental.TensorLike +TransformationFunction = Callable[[TensorLike], tf.Tensor] +LossFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +MetricFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] + + +def serialize_keras_object(obj): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.serialize_keras_object(obj) + else: + return tf.keras.utils.serialize_keras_object(obj) + + +def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + else: + return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + + +class _RankingLoss(object, metaclass=abc.ABCMeta): + """Interface for ranking loss.""" + + def __init__(self, name, lambda_weight=None, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + self._name = name + self._lambda_weight = lambda_weight + self._temperature = temperature + self._ragged = ragged + + @property + def name(self): + """The loss name.""" + return self._name + + def _prepare_and_validate_params(self, labels, logits, weights, mask): + """Prepares and validate input parameters. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A tuple (labels, logits, weights, mask) of `tf.Tensor` objects that are + ready to be used in the loss. + """ + if self._ragged: + labels, logits, weights, mask = ragged_to_dense(labels, logits, weights) + + if mask is None: + mask = is_label_valid(labels) + + if weights is None: + weights = 1.0 + + labels = tf.convert_to_tensor(labels) + logits = tf.convert_to_tensor(logits) + weights = tf.convert_to_tensor(weights) + mask = tf.convert_to_tensor(mask) + + return labels, logits, weights, mask + + def compute_unreduced_loss(self, labels, logits, mask=None): + """Computes the unreduced loss. 
+ + Args: + labels: A `Tensor` or `RaggedTensor` of the same shape as `logits` + representing graded relevance. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. Will be ignored if the loss + was constructed with ragged=True. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + @abc.abstractmethod + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Implementation for the unreduced loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + raise NotImplementedError('Calling an abstract method.') + + def normalize_weights(self, labels, weights): + """Normalizes weights. + + This is needed for `tf.estimator` given that the reduction may be + `SUM_OVER_NONZERO_WEIGHTS`. + + This method is also needed to compute normalized weights when calling + `compute_unreduced_loss`, which is done in the tf.keras losses. + + Args: + labels: A `Tensor` of shape [batch_size, list_size] representing graded + relevance. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + + Returns: + The normalized weights. + """ + if self._ragged: + labels, _, weights, _ = utils.ragged_to_dense(labels, None, weights) + return self._normalize_weights_impl(labels, weights) + + def _normalize_weights_impl(self, labels, weights): + """See `normalize_weights`.""" + del labels + return 1.0 if weights is None else weights + + def get_logits(self, logits): + """Computes logits rescaled by temperature. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + Tensor of rescaled logits. + """ + if not tf.is_tensor(logits): + logits = tf.convert_to_tensor(value=logits) + return logits / self._temperature + + def compute(self, labels, logits, weights, reduction, mask=None): + """Computes the reduced loss for tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to + reduce training loss over batch. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + Reduced loss for training and eval. 
+ """ + logits = self.get_logits(logits) + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + @abc.abstractmethod + def compute_per_list(self, labels, logits, weights, mask=None): + """Computes the per-list loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A pair of `Tensor` objects of shape [batch_size] containing per-list + losses and weights. + """ + raise NotImplementedError('Calling an abstract method.') + + def eval_metric(self, labels, logits, weights, mask=None): + """Computes the eval metric for the loss in tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the metric. + + Returns: + A metric op. + """ + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.metrics.mean(losses, weights) + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) 
diff --git a/deepray/losses/contrastive.py b/deepray/losses/contrastive.py index 7d138562..501d47a2 100644 --- a/deepray/losses/contrastive.py +++ b/deepray/losses/contrastive.py @@ -15,9 +15,9 @@ """Implements contrastive loss.""" import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import TensorLike, Number @@ -66,7 +66,7 @@ def contrastive_loss(y_true: TensorLike, y_pred: TensorLike, margin: Number = 1. @tf.keras.utils.register_keras_serializable(package="Deepray") -class ContrastiveLoss(LossFunctionWrapper): +class ContrastiveLoss(losses.LossFunctionWrapper): r"""Computes the contrastive loss between `y_true` and `y_pred`. This loss encourages the embedding to be close to each other for diff --git a/deepray/losses/focal_loss.py b/deepray/losses/focal_loss.py index 7b4b6cd7..21e2ddbe 100644 --- a/deepray/losses/focal_loss.py +++ b/deepray/losses/focal_loss.py @@ -15,15 +15,14 @@ """Implements Focal loss.""" import tensorflow as tf -import tensorflow.keras.backend as K +import tf_keras as keras from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import FloatTensorLike, TensorLike @tf.keras.utils.register_keras_serializable(package="Deepray") -class SigmoidFocalCrossEntropy(LossFunctionWrapper): +class SigmoidFocalCrossEntropy(keras.losses.LossFunctionWrapper): """Implements the focal loss function. Focal loss was first introduced in the RetinaNet paper @@ -118,7 +117,7 @@ def sigmoid_focal_crossentropy( y_true = tf.cast(y_true, dtype=y_pred.dtype) # Get the cross_entropy for each entry - ce = K.binary_crossentropy(y_true, y_pred, from_logits=from_logits) + ce = keras.binary_crossentropy(y_true, y_pred, from_logits=from_logits) # If logits are provided then convert the predictions into probabilities if from_logits: diff --git a/deepray/losses/giou_loss.py b/deepray/losses/giou_loss.py index a2dda7af..81a49d96 100644 --- a/deepray/losses/giou_loss.py +++ b/deepray/losses/giou_loss.py @@ -17,14 +17,14 @@ from typing import Optional import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import TensorLike @tf.keras.utils.register_keras_serializable(package="Deepray") -class GIoULoss(LossFunctionWrapper): +class GIoULoss(losses.LossFunctionWrapper): """Implements the GIoU loss function. 
GIoU loss was first introduced in the diff --git a/deepray/losses/lifted.py b/deepray/losses/lifted.py index 9146440c..7dade105 100644 --- a/deepray/losses/lifted.py +++ b/deepray/losses/lifted.py @@ -14,13 +14,14 @@ # ============================================================================== """Implements lifted_struct_loss.""" +from typing import Optional + import tensorflow as tf -from deepray.losses import metric_learning +from tensorflow.python.keras import losses +from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper +from deepray.losses import metric_learning from deepray.utils.types import FloatTensorLike, TensorLike -from typeguard import typechecked -from typing import Optional @tf.keras.utils.register_keras_serializable(package="Deepray") @@ -106,7 +107,7 @@ def lifted_struct_loss(labels: TensorLike, embeddings: TensorLike, margin: Float @tf.keras.utils.register_keras_serializable(package="Deepray") -class LiftedStructLoss(LossFunctionWrapper): +class LiftedStructLoss(losses.LossFunctionWrapper): """Computes the lifted structured loss. The loss encourages the positive distances (between a pair of embeddings diff --git a/deepray/losses/losses_impl.py b/deepray/losses/losses_impl.py new file mode 100644 index 00000000..6230ac9e --- /dev/null +++ b/deepray/losses/losses_impl.py @@ -0,0 +1,1937 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the losses for TF-Ranking.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math + +import tensorflow as tf +from deepray.losses import utils + +# The smallest probability that is used to derive smallest logit for invalid or +# padding entries. +_EPSILON = 1e-10 + + +def _safe_default_gain_fn(labels): + """Calculates safe gain functions for NDCG. + + In applications such as distillation, the labels could have extreme values + that might result in numerical error when using the original gain function. + This should only be applied to NDCG related losses, but not DCG ones. It + should be applied on both the numerator and the denominator of NDCG. + + Args: + labels: A `Tensor` with shape [batch_size, list_size], representing graded + relevance. + Returns: + A `tensor` of safe gain function values of shape [batch_size, list_size]. 
+ """ + max_labels = tf.reduce_max(labels, axis=-1, keepdims=True) + gains = tf.pow(2., labels - max_labels) - tf.pow(2., -max_labels) + return gains + + +def _check_tensor_shapes(tensors): + """Checks the tensor shapes to be compatible.""" + for tensor in tensors: + tensor = tf.convert_to_tensor(value=tensor) + tensor.get_shape().assert_has_rank(2) + tensor.get_shape().assert_is_compatible_with(tf.convert_to_tensor(value=tensors[0]).get_shape()) + + +def _apply_pairwise_op(op, tensor): + """Applies the op on tensor in the pairwise manner.""" + _check_tensor_shapes([tensor]) + return op(tf.expand_dims(tensor, 2), tf.expand_dims(tensor, 1)) + + +def _get_valid_pairs_and_clean_labels(labels): + """Returns a boolean Tensor for valid pairs and cleaned labels.""" + labels = tf.convert_to_tensor(value=labels) + labels.get_shape().assert_has_rank(2) + is_valid = utils.is_label_valid(labels) + valid_pairs = _apply_pairwise_op(tf.logical_and, is_valid) + labels = tf.compat.v1.where(is_valid, labels, tf.zeros_like(labels)) + return valid_pairs, labels + + +def approx_ranks(logits): + r"""Computes approximate ranks given a list of logits. + + Given a list of logits, the rank of an item in the list is one plus the total + number of items with a larger logit. In other words, + + rank_i = 1 + \sum_{j \neq i} I_{s_j > s_i}, + + where "I" is the indicator function. The indicator function can be + approximated by a generalized sigmoid: + + I_{s_j < s_i} \approx 1/(1 + exp(-(s_j - s_i)/temperature)). + + This function approximates the rank of an item using this sigmoid + approximation to the indicator function. This technique is at the core + of "A general approximation framework for direct optimization of + information retrieval measures" by Qin et al. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + A `Tensor` of ranks with the same shape as logits. + """ + list_size = tf.shape(input=logits)[1] + x = tf.tile(tf.expand_dims(logits, 2), [1, 1, list_size]) + y = tf.tile(tf.expand_dims(logits, 1), [1, list_size, 1]) + pairs = tf.sigmoid(y - x) + return tf.reduce_sum(input_tensor=pairs, axis=-1) + .5 + + +def inverse_max_dcg( + labels, + gain_fn=lambda labels: tf.pow(2.0, labels) - 1., + rank_discount_fn=lambda rank: 1. / tf.math.log1p(rank), + topn=None +): + """Computes the inverse of max DCG. + + Args: + labels: A `Tensor` with shape [batch_size, list_size]. Each value is the + graded relevance of the corresponding item. + gain_fn: A gain function. By default this is set to: 2^label - 1. + rank_discount_fn: A discount function. By default this is set to: + 1/log(1+rank). + topn: An integer as the cutoff of examples in the sorted list. + + Returns: + A `Tensor` with shape [batch_size, 1]. + """ + ideal_sorted_labels, = utils.sort_by_scores(labels, [labels], topn=topn) + rank = tf.range(tf.shape(input=ideal_sorted_labels)[1]) + 1 + discounted_gain = gain_fn(ideal_sorted_labels) * rank_discount_fn(tf.cast(rank, dtype=tf.float32)) + discounted_gain = tf.reduce_sum(input_tensor=discounted_gain, axis=1, keepdims=True) + return tf.compat.v1.where(tf.greater(discounted_gain, 0.), 1. / discounted_gain, tf.zeros_like(discounted_gain)) + + +def ndcg(labels, ranks=None, perm_mat=None): + """Computes NDCG from labels and ranks. + + Args: + labels: A `Tensor` with shape [batch_size, list_size], representing graded + relevance. + ranks: A `Tensor` of the same shape as labels, or [1, list_size], or None. 
+ If ranks=None, we assume the labels are sorted in their rank. + perm_mat: A `Tensor` with shape [batch_size, list_size, list_size] or None. + Permutation matrices with rows correpond to the ranks and columns + correspond to the indices. An argmax over each row gives the index of the + element at the corresponding rank. + + Returns: + A `tensor` of NDCG, ApproxNDCG, or ExpectedNDCG of shape [batch_size, 1]. + """ + if ranks is not None and perm_mat is not None: + raise ValueError('Cannot use both ranks and perm_mat simultaneously.') + + if ranks is None: + list_size = tf.shape(labels)[1] + ranks = tf.range(list_size) + 1 + discounts = 1. / tf.math.log1p(tf.cast(ranks, dtype=tf.float32)) + gains = _safe_default_gain_fn(tf.cast(labels, dtype=tf.float32)) + if perm_mat is not None: + gains = tf.reduce_sum(input_tensor=perm_mat * tf.expand_dims(gains, 1), axis=-1) + dcg = tf.reduce_sum(input_tensor=gains * discounts, axis=-1, keepdims=True) + normalized_dcg = dcg * inverse_max_dcg(labels, gain_fn=_safe_default_gain_fn) + + return normalized_dcg + + +class _LambdaWeight(object, metaclass=abc.ABCMeta): + """Interface for ranking metric optimization. + + This class wraps weights used in the LambdaLoss framework for ranking metric + optimization (https://ai.google/research/pubs/pub47258). Such an interface is + to be instantiated by concrete lambda weight models. The instance is used + together with standard loss such as logistic loss and softmax loss. + """ + # TODO: Define a public version of `_LambdaWeight` for typing + # annotations. + + @abc.abstractmethod + def pair_weights(self, labels, ranks): + """Returns the weight adjustment `Tensor` for example pairs. + + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight example pairs. + """ + raise NotImplementedError('Calling an abstract method.') + + def individual_weights(self, labels, ranks): + """Returns the weight `Tensor` for individual examples. + + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight individual examples. + """ + del ranks + return labels + + +class LabelDiffLambdaWeight(_LambdaWeight): + """A simple LambdaWeight to compute the pair label difference.""" + + def pair_weights(self, labels, ranks): + """Returns the absolute label difference for each pair.""" + del ranks # Unused. + return tf.abs(_apply_pairwise_op(tf.subtract, labels)) + + +class AbstractDCGLambdaWeight(_LambdaWeight): + """Abstract LambdaWeight for Discounted Cumulative Gain (DCG) metric.""" + + def __init__(self, topn=None, gain_fn=lambda label: label, rank_discount_fn=lambda rank: 1. / rank, normalized=False): + """Initializer. + + Ranks are 1-based, not 0-based. + + Args: + topn: (int) The topn for the DCG metric. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + normalized: (bool) If True, normalize weight by the max DCG. + """ + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + self._normalized = normalized + + @abc.abstractmethod + def _pair_rank_discount(self, ranks, topn): + """Computes the rank-based discount for a pair. + + Args: + ranks: A 2D `Tensor` for the 1-based ranks. + topn: A scalar `Tensor` for the topn cutoff. 
+ + Returns: + A pairwise weights `Tensor` based on the `rank_discount_fn`. + """ + raise NotImplementedError('Calling an abstract method.') + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='dcg_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + valid_pair, labels = _get_valid_pairs_and_clean_labels(labels) + gain = self._gain_fn(labels) + if self._normalized: + gain *= inverse_max_dcg(labels, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, topn=self._topn) + pair_gain = _apply_pairwise_op(tf.subtract, gain) + pair_gain *= tf.cast(valid_pair, dtype=tf.float32) + + list_size = tf.shape(input=labels)[1] + topn = self._topn or list_size + pair_weight = tf.abs(pair_gain) * self._pair_rank_discount(ranks, topn) + + # For LambdaLoss with relative rank difference, the scale of loss becomes + # much smaller when applying LambdaWeight. This affects the training can + # make the optimal learning rate become much larger. We use a heuristic to + # scale it up to the same magnitude as standard pairwise loss. + pair_weight *= tf.cast(tf.shape(input=labels)[1], dtype=tf.float32) + return pair_weight + + def individual_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='dcg_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + labels = tf.convert_to_tensor(value=labels) + labels = tf.compat.v1.where(utils.is_label_valid(labels), labels, tf.zeros_like(labels)) + gain = self._gain_fn(labels) + if self._normalized: + gain *= inverse_max_dcg(labels, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, topn=self._topn) + rank_discount = self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + return gain * rank_discount + + +class DCGLambdaWeight(AbstractDCGLambdaWeight): + """LambdaWeight for Discounted Cumulative Gain metric.""" + + def __init__( + self, + topn=None, + gain_fn=lambda label: label, + rank_discount_fn=lambda rank: 1. / rank, + normalized=False, + smooth_fraction=0. + ): + """Initializer. + + Ranks are 1-based, not 0-based. Given rank i and j, there are two types of + pair weights: + u = |rank_discount_fn(|i-j|) - rank_discount_fn(|i-j| + 1)| + v = |rank_discount_fn(i) - rank_discount_fn(j)| + where u is the newly introduced one in LambdaLoss paper + (https://ai.google/research/pubs/pub47258) and v is the original one in the + LambdaMART paper "From RankNet to LambdaRank to LambdaMART: An Overview". + The final pair weight contribution of ranks is + (1-smooth_fraction) * u + smooth_fraction * v. + + Args: + topn: (int) The topn for the DCG metric. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + normalized: (bool) If True, normalize weight by the max DCG. + smooth_fraction: (float) parameter to control the contribution from + LambdaMART. + """ + super().__init__(topn, gain_fn, rank_discount_fn, normalized) + if not 0. <= smooth_fraction <= 1.: + raise ValueError('smooth_fraction %s should be in range [0, 1].' % smooth_fraction) + self._smooth_fraction = smooth_fraction + + def _pair_rank_discount(self, ranks, topn): + """See `_LambdaWeight`.""" + + def _discount_for_relative_rank_diff(): + """Rank-based discount in the LambdaLoss paper.""" + # The LambdaLoss is not well defined when topn is active and topn < + # list_size. The following implementation is based on Equation 18 proposed + # in https://research.google/pubs/pub47258/. 
Please refer to + # `DCGLambdaWeightV2` for a better implemention to handle topn. + pair_valid_rank = _apply_pairwise_op(tf.logical_or, tf.less_equal(ranks, topn)) + rank_diff = tf.cast(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), dtype=tf.float32) + pair_discount = tf.where( + tf.logical_and(tf.greater(rank_diff, 0), pair_valid_rank), + tf.abs(self._rank_discount_fn(rank_diff) - self._rank_discount_fn(rank_diff + 1)), tf.zeros_like(rank_diff) + ) + return pair_discount + + def _discount_for_absolute_rank(): + """Standard discount in the LambdaMART paper.""" + # When the rank discount is (1 / rank) for example, the discount is + # |1 / r_i - 1 / r_j|. When i or j > topn, the discount becomes 0. + rank_discount = tf.compat.v1.where( + tf.greater(ranks, topn), tf.zeros_like(tf.cast(ranks, dtype=tf.float32)), + self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + ) + pair_discount = tf.abs(_apply_pairwise_op(tf.subtract, rank_discount)) + return pair_discount + + u = _discount_for_relative_rank_diff() + v = _discount_for_absolute_rank() + pair_discount = (1. - self._smooth_fraction) * u + self._smooth_fraction * v + pair_mask = _apply_pairwise_op(tf.logical_or, tf.less_equal(ranks, topn)) + return pair_discount * tf.cast(pair_mask, dtype=tf.float32) + + +class DCGLambdaWeightV2(AbstractDCGLambdaWeight): + """The V2 version of LambdaWeight for DCG metric. + + V2: Everything is the same as LambdaLoss when topn=None. When topn is + activated, for any pair i, j where max(i, j) > topn, we multiply the inverse + of 1-1/log(1+max(i,j)) for example. + """ + + def _pair_rank_discount(self, ranks, topn): + """Implements the rank discount for pairs in topn metrics.""" + rank_diff = tf.cast(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), dtype=tf.float32) + max_rank = tf.cast(_apply_pairwise_op(tf.math.maximum, ranks), tf.float32) + multiplier = tf.where( + tf.greater(max_rank, tf.cast(topn, tf.float32)), 1. / (1. - self._rank_discount_fn(max_rank)), 1. + ) + pair_discount = tf.where( + tf.greater(rank_diff, 0.), + tf.abs(self._rank_discount_fn(rank_diff) - self._rank_discount_fn(rank_diff + 1)) * multiplier, + tf.zeros_like(rank_diff) + ) + return pair_discount + + +class YetiDCGLambdaWeight(DCGLambdaWeightV2): + """A simple LambdaWeight to compute pair weight on neighbor pairs.""" + + def pair_weights(self, labels: tf.Tensor, ranks: tf.Tensor) -> tf.Tensor: + """See `_LambdaWeight`.""" + pair_weight = super().pair_weights(labels, ranks) + with tf.compat.v1.name_scope(name='yeti_dcg_lambda_weight'): + neighbor_pair = tf.equal(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), 1) + pair_weight *= tf.cast(neighbor_pair, dtype=tf.float32) + return pair_weight + + +class PrecisionLambdaWeight(_LambdaWeight): + """LambdaWeight for Precision metric.""" + + def __init__(self, topn, positive_fn=lambda label: tf.greater_equal(label, 1.0)): + """Constructor. + + Args: + topn: (int) The K in Precision@K metric. + positive_fn: (function): A function on `Tensor` that output boolean True + for positive examples. The rest are negative examples. + """ + self._topn = topn + self._positive_fn = positive_fn + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`. + + The current implementation here is that for any pairs of documents i and j, + we set the weight to be 1 if + - i and j have different labels. + - i <= topn and j > topn or i > topn and j <= topn. + This is exactly the same as the original LambdaRank method. The weight is + the gain of swapping a pair of documents. 
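+
+    For example (illustrative): with topn=2, binary labels [0, 1, 0, 1] for
+    the items at ranks [1, 2, 3, 4], only the label-disagreeing pairs that
+    straddle the top-2 cutoff get weight 1, namely (rank 1, rank 4) and
+    (rank 2, rank 3); swapping either pair changes Precision@2.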
+ + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight example pairs. + """ + with tf.compat.v1.name_scope(name='precision_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + valid_pair, labels = _get_valid_pairs_and_clean_labels(labels) + binary_labels = tf.cast(self._positive_fn(labels), dtype=tf.float32) + label_diff = tf.abs(_apply_pairwise_op(tf.subtract, binary_labels)) + label_diff *= tf.cast(valid_pair, dtype=tf.float32) + # i <= topn and j > topn or i > topn and j <= topn, i.e., xor(i <= topn, j + # <= topn). + rank_mask = _apply_pairwise_op(tf.math.logical_xor, tf.less_equal(ranks, self._topn)) + return label_diff * tf.cast(rank_mask, dtype=tf.float32) + + +class ListMLELambdaWeight(_LambdaWeight): + """LambdaWeight for ListMLE cost function.""" + + def __init__(self, rank_discount_fn): + """Constructor. + + Ranks are 1-based, not 0-based. + + Args: + rank_discount_fn: (function) The rank discount function. + """ + self._rank_discount_fn = rank_discount_fn + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + pass + + def individual_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='p_list_mle_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + labels = tf.convert_to_tensor(value=labels) + rank_discount = self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + return tf.ones_like(labels) * rank_discount + + +def _compute_ranks(logits, is_valid): + """Computes ranks by sorting valid logits. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + is_valid: A `Tensor` of the same shape as `logits` representing validity of + each entry. + + Returns: + The `ranks` Tensor. + """ + _check_tensor_shapes([logits, is_valid]) + # Only sort entries with is_valid = True. + scores = tf.compat.v1.where( + is_valid, logits, -1e-6 * tf.ones_like(logits) + tf.reduce_min(input_tensor=logits, axis=1, keepdims=True) + ) + return utils.sorted_ranks(scores) + + +def _pairwise_comparison(labels, logits, mask, pairwise_logits_op=tf.subtract): + r"""Returns pairwise comparison `Tensor`s. + + Given a list of n items, the labels of graded relevance l_i and the logits + s_i, we form n^2 pairs. For each pair, we have the following: + + / + | 1 if l_i > l_j for valid l_i and l_j. + * `pairwise_labels` = | + | 0 otherwise + \ + * `pairwise_logits` = pairwise_logits_op(s_i, s_j) + + Args: + labels: A `Tensor` with shape [batch_size, list_size]. + logits: A `Tensor` with shape [batch_size, list_size]. + mask: A `Tensor` with shape [batch_size, list_size] indicating which entries + are valid for computing the pairwise comparisons. + pairwise_logits_op: A pairwise function which operates on 2 tensors. + + Returns: + A tuple of (pairwise_labels, pairwise_logits) with each having the shape + [batch_size, list_size, list_size]. + """ + # Compute the difference for all pairs in a list. The output is a Tensor with + # shape [batch_size, list_size, list_size] where the entry [-1, i, j] stores + # the information for pair (i, j). + pairwise_label_diff = _apply_pairwise_op(tf.subtract, labels) + pairwise_logits = _apply_pairwise_op(pairwise_logits_op, logits) + # Only keep the case when l_i > l_j. 
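+  # Illustrative sketch: for a single list with labels [3, 1, 2], the label
+  # difference matrix is [[0, 2, 1], [-2, 0, -1], [-1, 1, 0]], so
+  # pairwise_labels below is [[0, 1, 1], [0, 0, 0], [0, 1, 0]] before masking.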
+ pairwise_labels = tf.cast(tf.greater(pairwise_label_diff, 0), dtype=tf.float32) + valid_pair = _apply_pairwise_op(tf.logical_and, mask) + pairwise_labels *= tf.cast(valid_pair, dtype=tf.float32) + return pairwise_labels, pairwise_logits + + +class GumbelSampler(object): + """Random sampler for sampling gumbel distributed logits.""" + + def __init__(self, name=None, sample_size=8, temperature=1.0, seed=None, ragged=False): + """Constructor.""" + self._name = name + self._sample_size = sample_size + self._temperature = temperature + self._seed = seed + self._ragged = ragged + + def sample(self, labels, logits, weights=None): + """Samples scores from Concrete(logits). + + If the sampler was constructed with `ragged=True` this method expects + `labels`, `logits` and item-wise `weights` to be a `RaggedTensor`. + + Args: + labels: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size] + same as `logits`, representing graded relevance. Or in the diversity + tasks, a `Tensor` (or `RaggedTensor`) with shape [batch_size, list_size, + subtopic_size]. Each value represents relevance to a subtopic, 1 for + relevent subtopic, 0 for irrelevant, and -1 for paddings. When the + actual subtopic number of a query is smaller than the `subtopic_size`, + `labels` will be padded to `subtopic_size` with -1. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` or `RaggedTensor` with shape [batch_size, + list_size] for item-wise weights. If None, the weight of a list in the + mini-batch is set to the sum of the labels of the items in that list. + + Returns: + A tuple of expanded labels, logits, and weights where the first dimension + is now batch_size * sample_size. Logit Tensors are sampled from + Concrete(logits) while labels and weights are simply tiled so the + resulting + Tensor has the updated dimensions. + """ + with tf.compat.v1.name_scope(self._name, 'gumbel_softmax_sample', (labels, logits, weights)): + # Convert ragged tensors to dense and construct a mask. + if self._ragged: + is_weights_ragged = isinstance(weights, tf.RaggedTensor) + labels, logits, weights, mask = utils.ragged_to_dense(labels, logits, weights) + + batch_size = tf.shape(input=labels)[0] + list_size = tf.shape(input=labels)[1] + + # Expand labels. + expanded_labels = tf.expand_dims(labels, 1) + expanded_labels = tf.repeat(expanded_labels, [self._sample_size], axis=1) + expanded_labels = utils.reshape_first_ndims(expanded_labels, 2, [batch_size * self._sample_size]) + + # Sample logits from Concrete(logits). 
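+      # Background note: -log(-log(u)) with u ~ Uniform(0, 1) is a standard
+      # Gumbel(0, 1) draw (see _sample_gumbel below); adding independent
+      # Gumbel noise to the logits and sorting the perturbed scores samples a
+      # permutation from the Plackett-Luce model induced by the logits, and
+      # the temperature rescaling below sharpens or smooths that relaxation.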
+ sampled_logits = tf.expand_dims(logits, 1) + sampled_logits = tf.tile(sampled_logits, [1, self._sample_size, 1]) + sampled_logits += _sample_gumbel([batch_size, self._sample_size, list_size], seed=self._seed) + sampled_logits = tf.reshape(sampled_logits, [batch_size * self._sample_size, list_size]) + + is_label_valid = utils.is_label_valid(expanded_labels) + if is_label_valid.shape.rank > 2: + is_label_valid = tf.reduce_any(is_label_valid, axis=-1) + sampled_logits = tf.compat.v1.where( + is_label_valid, sampled_logits / self._temperature, + tf.math.log(1e-20) * tf.ones_like(sampled_logits) + ) + sampled_logits = tf.math.log(tf.nn.softmax(sampled_logits) + 1e-20) + + expanded_weights = weights + if expanded_weights is not None: + true_fn = lambda: tf.expand_dims(tf.expand_dims(expanded_weights, 1), 1) + false_fn = lambda: tf.expand_dims(expanded_weights, 1) + expanded_weights = tf.cond(pred=tf.math.equal(tf.rank(expanded_weights), 1), true_fn=true_fn, false_fn=false_fn) + expanded_weights = tf.tile(expanded_weights, [1, self._sample_size, 1]) + expanded_weights = tf.reshape(expanded_weights, [batch_size * self._sample_size, -1]) + + # Convert dense tensors back to ragged. + if self._ragged: + # Construct expanded mask for the number of samples. + expanded_mask = tf.expand_dims(mask, 1) + expanded_mask = tf.repeat(expanded_mask, [self._sample_size], axis=1) + expanded_mask = tf.reshape(expanded_mask, [batch_size * self._sample_size, list_size]) + # Convert labels and sampled logits to ragged tensors. + expanded_labels = tf.ragged.boolean_mask(expanded_labels, expanded_mask) + sampled_logits = tf.ragged.boolean_mask(sampled_logits, expanded_mask) + # If ragged weights were provided, convert dense weights back to ragged. + if is_weights_ragged: + expanded_weights = tf.ragged.boolean_mask(expanded_weights, expanded_mask) + + return expanded_labels, sampled_logits, expanded_weights + + +def _sample_gumbel(shape, eps=1e-20, seed=None): + u = tf.random.uniform(shape, minval=0, maxval=1, dtype=tf.float32, seed=seed) + return -tf.math.log(-tf.math.log(u + eps) + eps) + + +class _RankingLoss(object, metaclass=abc.ABCMeta): + """Interface for ranking loss.""" + + def __init__(self, name, lambda_weight=None, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + self._name = name + self._lambda_weight = lambda_weight + self._temperature = temperature + self._ragged = ragged + + @property + def name(self): + """The loss name.""" + return self._name + + def _prepare_and_validate_params(self, labels, logits, weights, mask): + """Prepares and validate input parameters. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A tuple (labels, logits, weights, mask) of `tf.Tensor` objects that are + ready to be used in the loss. 
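+
+    Note (following the TF-Ranking convention used throughout this file):
+    when `mask` is None it is derived from `utils.is_label_valid`, which
+    treats entries with label >= 0 as valid and negative labels (e.g. the -1
+    padding value) as invalid.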
+ """ + if self._ragged: + labels, logits, weights, mask = utils.ragged_to_dense(labels, logits, weights) + + if mask is None: + mask = utils.is_label_valid(labels) + + if weights is None: + weights = 1.0 + + labels = tf.convert_to_tensor(labels) + logits = tf.convert_to_tensor(logits) + weights = tf.convert_to_tensor(weights) + mask = tf.convert_to_tensor(mask) + + return labels, logits, weights, mask + + def compute_unreduced_loss(self, labels, logits, mask=None): + """Computes the unreduced loss. + + Args: + labels: A `Tensor` or `RaggedTensor` of the same shape as `logits` + representing graded relevance. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. Will be ignored if the loss + was constructed with ragged=True. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + @abc.abstractmethod + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Implementation for the unreduced loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + raise NotImplementedError('Calling an abstract method.') + + def normalize_weights(self, labels, weights): + """Normalizes weights. + + This is needed for `tf.estimator` given that the reduction may be + `SUM_OVER_NONZERO_WEIGHTS`. + + This method is also needed to compute normalized weights when calling + `compute_unreduced_loss`, which is done in the tf.keras losses. + + Args: + labels: A `Tensor` of shape [batch_size, list_size] representing graded + relevance. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + + Returns: + The normalized weights. + """ + if self._ragged: + labels, _, weights, _ = utils.ragged_to_dense(labels, None, weights) + return self._normalize_weights_impl(labels, weights) + + def _normalize_weights_impl(self, labels, weights): + """See `normalize_weights`.""" + del labels + return 1.0 if weights is None else weights + + def get_logits(self, logits): + """Computes logits rescaled by temperature. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + Tensor of rescaled logits. + """ + if not tf.is_tensor(logits): + logits = tf.convert_to_tensor(value=logits) + return logits / self._temperature + + def compute(self, labels, logits, weights, reduction, mask=None): + """Computes the reduced loss for tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. 
+ weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to + reduce training loss over batch. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + Reduced loss for training and eval. + """ + logits = self.get_logits(logits) + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + @abc.abstractmethod + def compute_per_list(self, labels, logits, weights, mask=None): + """Computes the per-list loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A pair of `Tensor` objects of shape [batch_size] containing per-list + losses and weights. + """ + raise NotImplementedError('Calling an abstract method.') + + def eval_metric(self, labels, logits, weights, mask=None): + """Computes the eval metric for the loss in tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the metric. + + Returns: + A metric op. + """ + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.metrics.mean(losses, weights) + + +class _PairwiseLoss(_RankingLoss, metaclass=abc.ABCMeta): + """Interface for pairwise ranking loss.""" + + @abc.abstractmethod + def _pairwise_loss(self, pairwise_logits): + """The loss of pairwise logits with l_i > l_j.""" + raise NotImplementedError('Calling an abstract method.') + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + ranks = _compute_ranks(logits, mask) + pairwise_labels, pairwise_logits = _pairwise_comparison(labels, logits, mask) + pairwise_weights = pairwise_labels + if self._lambda_weight is not None: + pairwise_weights *= self._lambda_weight.pair_weights(labels, ranks) + + pairwise_weights = tf.stop_gradient(pairwise_weights, name='weights_stop_gradient') + return self._pairwise_loss(pairwise_logits), pairwise_weights + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. 
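+    # Illustrative sketch: for a list with labels [1., 0.] and logits
+    # [s1, s2], the only pair with l_i > l_j is (1, 2), so with
+    # PairwiseLogisticLoss the per-list loss below reduces to
+    # log(1 + exp(-(s1 - s2))) before any lambda or item weights.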
+ labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Pairwise losses and weights will be of shape + # [batch_size, list_size, list_size]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # Compute the weighted per-pair loss. + weighted_per_pair_loss = tf.math.multiply(losses, weights) + + # Sum the inner dimensions to obtain per-list weights. For pairwise losses + # this typically indicates the (weighted) number of pairwise preferences per + # list. + per_list_weights = tf.reduce_sum(weights, axis=[1, 2]) + + # This computes the per-list losses by summing all weighted pairwise losses. + per_list_losses = tf.reduce_sum(weighted_per_pair_loss, axis=[1, 2]) + + # Normalize the per-list losses so that lists with different numbers of + # pairs have comparable losses. The different numbers of pairs is reflected + # in the per-list weights. + per_list_losses = tf.math.divide_no_nan(per_list_losses, per_list_weights) + + return per_list_losses, per_list_weights + + def _normalize_weights_impl(self, labels, weights): + """See _RankingLoss.""" + # The `weights` is item-wise and is applied non-symmetrically to update + # pairwise_weights as + # pairwise_weights(i, j) = w_i * pairwise_weights(i, j). + # This effectively applies to all pairs with l_i > l_j. Note that it is + # actually symmetric when `weights` are constant per list, i.e., listwise + # weights. + if weights is None: + weights = 1. + weights = tf.compat.v1.where(utils.is_label_valid(labels), tf.ones_like(labels) * weights, tf.zeros_like(labels)) + return tf.expand_dims(weights, axis=2) + + +class PairwiseLogisticLoss(_PairwiseLoss): + """Implements pairwise logistic loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + # The following is the same as log(1 + exp(-pairwise_logits)). + return tf.nn.relu(-pairwise_logits) + tf.math.log1p(tf.exp(-tf.abs(pairwise_logits))) + + +class PairwiseHingeLoss(_PairwiseLoss): + """Implements pairwise hinge loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + return tf.nn.relu(1 - pairwise_logits) + + +class PairwiseSoftZeroOneLoss(_PairwiseLoss): + """Implements pairwise hinge loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + return tf.compat.v1.where( + tf.greater(pairwise_logits, 0), 1. - tf.sigmoid(pairwise_logits), tf.sigmoid(-pairwise_logits) + ) + + +class PairwiseMSELoss(_PairwiseLoss): + """Implements pairwise MSE loss. + + This loss computes over all pairs, including those with the same labels, but + excluding self pairs in the diagonal of the pairwise matrix. + """ + + def _pairwise_loss(self, pairwise_logits): + # Unused because of overridding `_compute_unreduced_loss_impl`. + pass + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + + # Compute loss. + pairwise_label_diff = _apply_pairwise_op(tf.subtract, labels) + pairwise_logit_diff = _apply_pairwise_op(tf.subtract, logits) + pairwise_mse_loss = tf.math.square(pairwise_logit_diff - pairwise_label_diff) + valid_pair = _apply_pairwise_op(tf.logical_and, mask) + + # Compute weights. + pairwise_weights = tf.ones_like(pairwise_mse_loss) + batch_size, list_size = tf.unstack(tf.shape(input=labels)) + # Excluding the self pairs. 
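+    # tf.eye with batch_shape yields a [batch_size, list_size, list_size]
+    # identity, so subtracting it zeroes the diagonal (i == j) pair weights.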
+ pairwise_weights -= tf.eye(list_size, batch_shape=[batch_size], dtype=pairwise_weights.dtype) + # Including only valid pairs + pairwise_weights *= tf.cast(valid_pair, tf.float32) + if self._lambda_weight is not None: + ranks = _compute_ranks(logits, mask) + pairwise_weights *= self._lambda_weight.pair_weights(labels, ranks) + pairwise_weights = tf.stop_gradient(pairwise_weights, name='weights_stop_gradient') + + return pairwise_mse_loss, pairwise_weights + + +class _ListwiseLoss(_RankingLoss): + """Interface for listwise loss.""" + + def _normalize_weights_impl(self, labels, weights): + """See `_RankingLoss`.""" + if weights is None: + return 1.0 + else: + weights = tf.convert_to_tensor(value=weights) + labels = tf.convert_to_tensor(value=labels) + is_valid = utils.is_label_valid(labels) + labels = tf.where(is_valid, labels, tf.zeros_like(labels)) + return tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=(weights * labels), axis=1, keepdims=True), + tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + ) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Listwise losses and weights will be of shape [batch_size, 1]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # This removes the inner dimension of size 1 to make the output shape + # [batch_size]. + per_list_losses = tf.squeeze(losses, axis=1) + per_list_weights = tf.squeeze(weights, axis=1) + return per_list_losses, per_list_weights + + +class CircleLoss(_ListwiseLoss): + """Implements circle loss. + + This is the Circle loss originally proposed by Sun et al. + ["Circle Loss: A Unified Perspective of Pair Similarity Optimization"]. See + https://arxiv.org/abs/2002.10857. + + For a model that outputs similarity scores `s` on data point with + corresponding label y, the circle loss from Eq.(6) in the paper is + L_circle = log(1 + sum_{i is p,j is n} + exp(gamma * (a_j * (s_j - d_n) - a_i * (s_i - d_p)))), + defined for the binary label, p for data points with positive labels and n for + data points with negative labels. + a_i = relu(1 + margin - s_i) + a_j = relu(s_j + margin) + d_p = 1 - margin + d_n = margin + We can extend to non-binary labels with an indiactor function, + L_circle = log(1 + sum_{i, j} I_{y_i > y_j} + exp(gamma * (a_j * (s_j - d_n) - a_i * (s_i - d_p)))), + Note the loss takes only the similarity scores. We will clip any score value + beyond 0 and 1 to confine the scores in [0, 1], please be aware of that. + """ + + def __init__(self, name, lambda_weight=None, gamma=64, margin=0.25, ragged=False): + """Initializer. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + gamma: A float parameter used in circle loss. + margin: A float parameter defining the margin in circle loss. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, lambda_weight=lambda_weight, temperature=1.0, ragged=ragged) + self._margin = margin + self._gamma = gamma + + def get_logits(self, logits): + """See `_RankingLoss`.""" + # Add a clip to confine scores in [0, 1]. + return tf.clip_by_value(tf.convert_to_tensor(value=logits), 0., 1.) 
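+
+  # Worked numbers for the defaults above (illustrative): with margin = 0.25,
+  # d_p = 1 - margin = 0.75 and d_n = margin = 0.25, so a positive item with
+  # score s_i gets weight alpha_i = relu(1.25 - s_i) and a negative item with
+  # score s_j gets weight alpha_j = relu(s_j + 0.25), which are exactly the
+  # quantities formed in circle_loss_pairwise_op below.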
+ + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + + def circle_loss_pairwise_op(score_i, score_j): + alpha_i = tf.stop_gradient(tf.nn.relu(1 - score_i + self._margin), name='circle_loss_alpha_pos') + alpha_j = tf.stop_gradient(tf.nn.relu(score_j + self._margin), name='circle_loss_alpha_neg') + return alpha_i * (1 - score_i - self._margin) + alpha_j * (score_j - self._margin) + + pairwise_labels, pairwise_logits = _pairwise_comparison( + labels, logits, mask, pairwise_logits_op=circle_loss_pairwise_op + ) + pairwise_weights = tf.stop_gradient(pairwise_labels, name='weights_stop_gradient') + # TODO: try lambda_weights for circle loss. + # Pairwise losses and weights will be of shape + # [batch_size, list_size, list_size]. + losses = tf.exp(self._gamma * pairwise_logits) + + # This computes the per-list losses and weights for circle loss. + per_list_losses = tf.math.log1p(tf.reduce_sum(tf.math.multiply(losses, pairwise_weights), axis=[1, 2])) + per_list_weights = tf.reduce_sum(pairwise_weights, axis=[ + 1, 2 + ]) / tf.reduce_sum(tf.cast(pairwise_weights > 0, tf.float32), axis=[1, 2]) + + # Return per-list losses and weights with shape [batch_size, 1]. + return tf.expand_dims(per_list_losses, 1), tf.expand_dims(per_list_weights, 1) + + +class SoftmaxLoss(_ListwiseLoss): + """Implements softmax loss.""" + + def precompute(self, labels, logits, weights, mask=None): + """Precomputes Tensors for softmax cross entropy inputs.""" + if mask is None: + mask = utils.is_label_valid(labels) + ranks = _compute_ranks(logits, mask) + # Reset the masked labels to 0 and reset the masked logits to a logit with + # ~= 0 contribution in softmax. + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + if self._lambda_weight is not None and isinstance(self._lambda_weight, DCGLambdaWeight): + labels = self._lambda_weight.individual_weights(labels, ranks) + if weights is not None: + labels *= weights + return labels, logits + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + # Padding for rows with label_sum = 0. + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + padded_labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + padded_labels = tf.compat.v1.where(mask, padded_labels, tf.zeros_like(padded_labels)) + padded_label_sum = tf.reduce_sum(input_tensor=padded_labels, axis=1, keepdims=True) + labels_for_softmax = tf.math.divide_no_nan(padded_labels, padded_label_sum) + logits_for_softmax = logits + # Padded labels have 0 weights in label_sum. 
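+    # In effect this is the ListNet-style softmax cross entropy: each list
+    # contributes -sum_i (l_i / sum_j l_j) * log softmax(s)_i and is weighted
+    # by sum_j l_j, so all-zero (or fully padded) lists drop out.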
+ weights_for_softmax = tf.reshape(label_sum, [-1]) + losses = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels_for_softmax, logits_for_softmax) + return losses, weights_for_softmax + + def compute(self, labels, logits, weights, reduction, mask=None): + """See `_RankingLoss`.""" + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + losses, weights = self._compute_unreduced_loss_impl(labels, logits, mask) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + def eval_metric(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + losses, weights = self._compute_unreduced_loss_impl(labels, logits, mask) + return tf.compat.v1.metrics.mean(losses, weights) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # As opposed to the other listwise losses, SoftmaxLoss returns already + # squeezed losses, which can be returned directly. + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + def compute_unreduced_loss(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights=None, mask=mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + +class PolyOneSoftmaxLoss(SoftmaxLoss): + """Implements poly1 softmax loss.""" + + def __init__(self, name, lambda_weight=None, epsilon=1.0, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + epsilon: A float number for contribution of the first polynomial. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, lambda_weight=lambda_weight, temperature=temperature, ragged=ragged) + self._epsilon = epsilon + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + # Padding for rows with label_sum = 0. + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + padded_labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + padded_labels = tf.compat.v1.where(mask, padded_labels, tf.zeros_like(padded_labels)) + padded_label_sum = tf.reduce_sum(input_tensor=padded_labels, axis=1, keepdims=True) + labels_for_softmax = tf.math.divide_no_nan(padded_labels, padded_label_sum) + logits_for_softmax = logits + # Padded labels have 0 weights in label_sum. 
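+    # Poly-1 term (following the PolyLoss formulation): pt below is the
+    # probability mass the softmax assigns to the normalized target labels,
+    # and the loss adds epsilon * (1 - pt) on top of the softmax cross
+    # entropy.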
+ weights_for_softmax = tf.reshape(label_sum, [-1]) + pt = tf.reduce_sum(labels_for_softmax * tf.nn.softmax(logits_for_softmax), axis=-1) + ce = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels_for_softmax, logits_for_softmax) + losses = ce + self._epsilon * (1 - pt) + return losses, weights_for_softmax + + +class UniqueSoftmaxLoss(_ListwiseLoss): + """Implements unique rating softmax loss.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + pairwise_labels, _ = _pairwise_comparison(labels, logits, mask) + # Used in denominator to compute unique softmax probability for each doc. + denominator_logits = tf.expand_dims(logits, axis=1) * pairwise_labels + denominator_logits = tf.concat([denominator_logits, tf.expand_dims(logits, axis=2)], axis=2) + denominator_mask = tf.concat([pairwise_labels, tf.expand_dims(tf.ones_like(logits), axis=2)], axis=2) + denominator_logits = tf.where( + tf.greater(denominator_mask, 0.0), denominator_logits, + -1e-3 + tf.reduce_min(denominator_logits) * tf.ones_like(denominator_logits) + ) + logits_max = tf.reduce_max(denominator_logits, axis=-1, keepdims=True) + # Subtract the max so that exp(denominator_logits) is numerically valid. + denominator_logits -= logits_max + logits -= tf.squeeze(logits_max, axis=-1) + # Set gains for loss weights. + gains = tf.pow(2.0, labels) - 1 + # Compute the softmax loss for each doc. + per_doc_softmax = -logits + tf.math.log(tf.reduce_sum(tf.exp(denominator_logits) * denominator_mask, axis=-1)) + losses = tf.reduce_sum(per_doc_softmax * gains, axis=1, keepdims=True) + return losses, tf.ones_like(losses) + + +class _PointwiseLoss(_RankingLoss): + """Interface for pointwise loss.""" + + def _normalize_weights_impl(self, labels, weights): + """See _RankingLoss.""" + if weights is None: + weights = 1. + return tf.compat.v1.where(utils.is_label_valid(labels), tf.ones_like(labels) * weights, tf.zeros_like(labels)) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Pointwise losses and weights will be of shape [batch_size, list_size]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # Compute the weighted per-item loss. + weighted_per_item_loss = tf.math.multiply(losses, weights) + + # Sum the inner dimensions to obtain per-list weights. For pointwise losses + # this typically indicates the (weighted) number of items per list. + per_list_weights = tf.reduce_sum(weights, axis=1) + + # This computes the per-list losses by summing all weighted per-item losses. + per_list_losses = tf.reduce_sum(weighted_per_item_loss, axis=1) + + # Normalize the per-list losses so that lists with different numbers of + # items have comparable losses. The different numbers of items is reflected + # in the per-list weights. + per_list_losses = tf.math.divide_no_nan(per_list_losses, per_list_weights) + return per_list_losses, per_list_weights + + +class ClickEMLoss(_PointwiseLoss): + """Implements the click EM loss with examination and relevance. 
+ + The implementation is based on the the paper by Wang et al: "Position bias + estimation for unbiased learning to rank in personal search." It assumes that + a click is generated by a factorized model P(examination) * P(relevance), + which are latent variables determined by `exam_logits` and `rel_logits` + respectively. An EM algorithm is used for estimation and this function + implements the expectation step to estimate the P(latent | observed), i.e., + P(examination | click) and P(relevance | click). + """ + + def __init__(self, name, temperature=1.0, exam_loss_weight=1.0, rel_loss_weight=1.0, ragged=False): + super().__init__(name, None, temperature, ragged) + self._exam_loss_weight = exam_loss_weight + self._rel_loss_weight = rel_loss_weight + + def _compute_latent_prob(self, clicks, exam_logits, rel_logits): + """Computes the probability of latent variables in EM. + + The original compuation is as follows and can be unstable: + exam_prob = sigmoid(exam_logits) + rel_prob = sigmoid(rel_logits) + exam_prob_posterior = exam_prob * (1 - rel_prob) / (1 - exam_prob * + rel_prob) + rel_prob_posterior = rel_prob * (1 - exam_prob) / (1 - exam_prob * + rel_prob). + + To increase the numeric stability, we compute the posteriror logits first. + Using the exam_logits_posterior as an example, we have: + exam_logit_posterior = logit(exam_prob_posterior) + = log(exam_prob_posterior / (1 - exam_prob_posterior)) + It can be reduced to exam_logits and rel_logits: + exam_logit_posterior = exam_logits - log(1 + exp(rel_logits)) + = exam_logits - softplus(rel_logits) + + We can do similar reduction for rel_logit_posterior. Then we compute the + posterior probablity by apply sigmoid on the logits. + + Args: + clicks: A 2-D `Tensor` for clicks as observed data. A value >= 1.0 is + treated as clicked. + exam_logits: A 2-D `Tensor` to compute P(examination) and has the same + shape as `clicks`. + rel_logits: A 2-D `Tensor` to compute P(relevance) and has the same shape + as `clicks`. + + Returns: + A tuple of (exam_given_clicks, rel_given_clicks) representing + P(examination | click) and P(relevance | click). + """ + with tf.compat.v1.name_scope(name='compute_latent_prob'): + is_clicked = tf.greater_equal(tf.cast(clicks, tf.float32), 1.0) + exam_logits_posterior = exam_logits - tf.math.softplus(rel_logits) + rel_logits_posterior = rel_logits - tf.math.softplus(exam_logits) + exam_prob_posterior = tf.compat.v1.where( + is_clicked, tf.ones_like(exam_logits_posterior), tf.sigmoid(exam_logits_posterior) + ) + rel_prob_posterior = tf.compat.v1.where( + is_clicked, tf.ones_like(rel_logits_posterior), tf.sigmoid(rel_logits_posterior) + ) + return tf.stop_gradient(exam_prob_posterior), tf.stop_gradient(rel_prob_posterior) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Computes the loss for each element. + + Args: + labels: A `Tensor` with shape [batch_size, list_size] representing clicks. + logits: A `Tensor` with shape [batch_size, list_size, 2], where the first + value in the 3rd-dim is the logits for examination and the second value + is the logits for relevance. + mask: A `Tensor` of the same shape as labels indicating which entries are + valid for computing the loss. + + Returns: + A tuple(losses, loss_weights). 
+ """ + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + exam_logits, rel_logits = tf.unstack(logits, axis=2) + exam_logits = tf.compat.v1.where(mask, exam_logits, tf.zeros_like(exam_logits)) + rel_logits = tf.compat.v1.where(mask, rel_logits, tf.zeros_like(rel_logits)) + # The distribution in the E step. + exam_latent_prob, rel_latent_prob = self._compute_latent_prob(labels, exam_logits, rel_logits) + # The loss in the M step. + losses = tf.compat.v1.nn.sigmoid_cross_entropy_with_logits( + labels=exam_latent_prob, logits=exam_logits + ) * self._exam_loss_weight + losses += tf.compat.v1.nn.sigmoid_cross_entropy_with_logits( + labels=rel_latent_prob, logits=rel_logits + ) * self._rel_loss_weight + return losses, tf.cast(mask, dtype=tf.float32) + + +class SigmoidCrossEntropyLoss(_PointwiseLoss): + """Implements sigmoid cross entropy loss.""" + + def __init__(self, name, temperature=1.0, ragged=False): + """Overwrite the constructor. + + Args: + name: A string used as the name for this loss. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, None, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + losses = tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) + return losses, tf.cast(mask, dtype=tf.float32) + + +class MeanSquaredLoss(_PointwiseLoss): + """Implements the means squared error loss.""" + + def __init__(self, name, ragged=False): + """Overwrite the constructor. + + Args: + name: A string used as the name for this loss. + ragged: A boolean indicating whether the input tensors are ragged. + """ + # temperature is not used in this loss. + super().__init__(name, None, temperature=1.0, ragged=ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + losses = tf.compat.v1.squared_difference(labels, logits) + return losses, tf.cast(mask, dtype=tf.float32) + + +class MixtureEMLoss(_ListwiseLoss): + """Implements the Mixture EM loss with examination and relevance. + + An Expecatation-Maximization (EM) algorithm is used for estimation and this + function. + """ + + def __init__(self, name, temperature=1.0, alpha=1.0, ragged=False): + super().__init__(name, None, temperature, ragged) + self._alpha = alpha + + def _compute_model_prob(self, per_list_logodds): + """Computes the probability of models in EM. + + Args: + per_list_logodds: A `Tensor` with shape [batch_size, 1, model_num]. + + Returns: + A `Tensor` of probability with shape [batch_size, 1, model_num]. + """ + with tf.compat.v1.name_scope(name='compute_model_prob'): + return tf.stop_gradient( + tf.exp(-self._alpha * (per_list_logodds - tf.reduce_min(per_list_logodds, axis=2, keepdims=True))) + ) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Computes the loss for each element. + + Args: + labels: A `Tensor` with shape [batch_size, list_size] representing clicks. 
+ logits: A `Tensor` with shape [batch_size, list_size, model_num], where + the 3rd-dim is dimension for the models to mix. + mask: A `Tensor` of the same shape as labels indicating which entries are + valid for computing the loss. + + Returns: + A tuple(losses, loss_weights). + """ + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + # The loss in the M step. + # shape = [batch_size, list_size, model_num] + losses = tf.stack( + [ + tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=model_logits) + for model_logits in tf.unstack(logits, axis=-1) + ], + axis=2 + ) + losses = tf.where(tf.expand_dims(mask, axis=-1), losses, tf.zeros_like(losses, dtype=tf.float32)) + + # The model probability in the E step. + losses_no_gradient = tf.stop_gradient(losses) + # shape = [batch_size, 1, model_num] + per_list_logodds = tf.reduce_sum(losses_no_gradient, axis=1, keepdims=True) + model_prob = self._compute_model_prob(per_list_logodds) + prob_norm = tf.reduce_sum(model_prob, axis=2, keepdims=True) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(label_sum, 0.0) + return tf.reshape(tf.reduce_sum(losses * model_prob / prob_norm, axis=[1, 2]), + [-1, 1]), tf.cast(nonzero_mask, dtype=tf.float32) + + +class ListMLELoss(_ListwiseLoss): + """Implements ListMLE loss.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + # Reset the masked labels to 0 and reset the masked logits to a logit with + # ~= 0 contribution. + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + scores = tf.compat.v1.where( + mask, labels, + tf.reduce_min(input_tensor=labels, axis=1, keepdims=True) - 1e-6 * tf.ones_like(labels) + ) + # Use a fixed ops-level seed and the randomness is controlled by the + # graph-level seed. + sorted_labels, sorted_logits = utils.sort_by_scores(scores, [labels, logits], shuffle_ties=True, seed=37) + + raw_max = tf.reduce_max(input_tensor=sorted_logits, axis=1, keepdims=True) + sorted_logits = sorted_logits - raw_max + sums = tf.cumsum(tf.exp(sorted_logits), axis=1, reverse=True) + sums = tf.math.log(sums) - sorted_logits + + if self._lambda_weight is not None and isinstance(self._lambda_weight, ListMLELambdaWeight): + batch_size, list_size = tf.unstack(tf.shape(input=sorted_labels)) + sums *= self._lambda_weight.individual_weights( + sorted_labels, tf.tile(tf.expand_dims(tf.range(list_size) + 1, 0), [batch_size, 1]) + ) + + negative_log_likelihood = tf.reduce_sum(input_tensor=sums, axis=1, keepdims=True) + return negative_log_likelihood, tf.ones_like(negative_log_likelihood) + + +class ApproxNDCGLoss(_ListwiseLoss): + """Implements ApproxNDCG loss.""" + + # Use a different default temperature. 
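+  # A smaller temperature makes sigmoid((s_j - s_i) / temperature) inside
+  # approx_ranks closer to a hard indicator, i.e. the approximate ranks
+  # closer to the true integer ranks, at the cost of a less smooth loss
+  # surface; hence the 0.1 default below rather than 1.0.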
+ def __init__(self, name, lambda_weight=None, temperature=0.1, ragged=False): + """See `_ListwiseLoss`.""" + super().__init__(name, lambda_weight, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where( + mask, logits, -1e3 * tf.ones_like(logits) + tf.reduce_min(input_tensor=logits, axis=-1, keepdims=True) + ) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + ranks = approx_ranks(logits) + + return -ndcg(labels, ranks), tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class ApproxMRRLoss(_ListwiseLoss): + """Implements ApproxMRR loss.""" + + # Use a different default temperature. + def __init__(self, name, lambda_weight=None, temperature=0.1, ragged=False): + """See `_ListwiseLoss`.""" + super().__init__(name, lambda_weight, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where( + mask, logits, -1e3 * tf.ones_like(logits) + tf.math.reduce_min(input_tensor=logits, axis=-1, keepdims=True) + ) + + label_sum = tf.math.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + + nonzero_mask = tf.math.greater(tf.reshape(label_sum, [-1]), 0.0) + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + + rr = 1. / approx_ranks(logits) + rr = tf.math.reduce_sum(input_tensor=rr * labels, axis=-1, keepdims=True) + mrr = rr / tf.math.reduce_sum(input_tensor=labels, axis=-1, keepdims=True) + return -mrr, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class NeuralSortCrossEntropyLoss(_ListwiseLoss): + """Implements Cross-entropy loss of neural sort permutation matrix.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + + # shape = [batch_size, list_size, list_size]. + true_perm = neural_sort(labels, mask=mask) + smooth_perm = neural_sort(logits, mask=mask) + + losses = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2( + labels=true_perm, logits=tf.math.log(1e-20 + smooth_perm), axis=2 + ) + + # Neural sort will place masked entries last. Losses are still computed on + # those entries so we need to cancel those out. This means we need to mask + # out the last n entries, where n is the number of masked items per list. We + # do so by sorting the mask and setting (masked) invalid losses to 0. + sorted_mask = tf.cast(tf.sort(tf.cast(mask, dtype=tf.float32), axis=1, direction='DESCENDING'), dtype=tf.bool) + losses = tf.where(sorted_mask, losses, tf.zeros_like(losses)) + + # shape = [batch_size, list_size]. 
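+    # Normalize the summed cross-entropy by the number of valid entries in
+    # each list; divide_no_nan yields 0 rather than NaN for fully masked lists.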
+ losses = tf.math.divide_no_nan( + tf.reduce_sum(input_tensor=losses, axis=-1, keepdims=True), + tf.reduce_sum(input_tensor=tf.cast(mask, dtype=tf.float32), axis=-1, keepdims=True) + ) + + return losses, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class NeuralSortNDCGLoss(_ListwiseLoss): + """Implements PiRank-NDCG loss. + + The PiRank-NDCG loss is a differentiable approximation of the NDCG metric + using the NeuralSort trick, which generates a permutation matrix based on + ranking scores. Please refer to https://arxiv.org/abs/2012.06731 for the + PiRank method. For PiRank-NDCG in specific, + NDCG_metric = - sum_i (2^y_i - 1) / log(1 + r_i) / maxDCG, + where y_i and r_i are the label and the score rank of the ith document + respectively. This metric can be also written as the sum over rank r with an + indicator function I, + NDCG_metric = - sum_{i,r} (2^y_i - 1) / log(1 + r) * I(r, r_i) / maxDCG, + where the indicator function I(r, r_i) = 1 if r = r_i and 0 otherwise, which + is the permutation matrix. + + Approximated with a differentiable permutation matrix using neural sort, + PiRank-NDCG = - sum_{i,r} (2^y_i - 1) / log(1 + r) * P(r, i) / maxDCG, + where P(r, i) is the approximation of the permutation matrix. + """ + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + # shape = [batch_size, list_size]. + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + # shape = [batch_size, list_size, list_size]. + smooth_perm = neural_sort(logits, mask=mask) + + return -ndcg(labels, perm_mat=smooth_perm), tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +def neural_sort(logits, name=None, mask=None): + r"""Generate the permutation matrix from logits by deterministic neuralsort. + + The sort on a list of logits can be approximated by a differentiable + permutation matrix using Neural Sort (https://arxiv.org/abs/1903.08850). + The approximation is achieved by constructing a list of functions on logits, + fn_i(k) = (list_size + 1 - 2*i) * logit_k - sum_j |logit_k - logit_j|, + whose value is maximal when k is at the ith largest logit. + So that the permutation matrix can be expressed as + / 1 if j = argmax_k fn_i(k) + P_ij = | = one_hot(argmax(fn_i(j))). + \ 0 otherwise + And the differentiable approximation of the matrix is applied with softmax, + P^_ij = softmax(fn_i(j) / temperature), + where the parameter temperature tunes the smoothiness of the approximation. + + #### References + [1]: Aditya Grover, Eric Wang, Aaron Zweig, Stefano Ermon. + Stochastic Optimization of Sorting Networks via Continuous Relaxations. + https://arxiv.org/abs/1903.08850 + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. (We are using logits here, + noticing the original paper is using probability weights, i.e., the + exponentials of the logits). + name: A string used as the name for this loss. + mask: A `Tensor` with the same shape as logits indicating which entries are + valid for computing the neural_sort. Invalid entries are pushed to the + end. 
+ + Returns: + A tensor of permutation matrices whose dimension is [batch_size, list_size, + list_size]. + """ + with tf.compat.v1.name_scope(name, 'neural_sort', [logits]): + if mask is None: + mask = tf.ones_like(logits, dtype=tf.bool) + + # Reset logits to 0 and compute number of valid entries for each list in the + # batch. + logits = tf.where(mask, logits, tf.zeros_like(logits)) + num_valid_entries = tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True) + + # Compute logit differences and mask out invalid entries. + logit_diff = tf.abs(tf.expand_dims(logits, 2) - tf.expand_dims(logits, 1)) + valid_pair_mask = _apply_pairwise_op(tf.logical_and, mask) + logit_diff = tf.where(valid_pair_mask, logit_diff, tf.zeros_like(logit_diff)) + # shape = [batch_size, 1, list_size]. + logit_diff_sum = tf.reduce_sum(input_tensor=logit_diff, axis=1, keepdims=True) + + # Compute masked range so that masked items do not influence scaling. + masked_range = tf.cumsum(tf.cast(mask, dtype=tf.int32), axis=1) + scaling = tf.cast(num_valid_entries + 1 - 2 * masked_range, dtype=tf.float32) + # shape = [batch_size, list_size]. + scaling = tf.expand_dims(scaling, 2) + # shape = [batch_size, list_size, list_size]. + # Use broadcast to align the dims. + scaled_logits = scaling * tf.expand_dims(logits, 1) + + p_logits = scaled_logits - logit_diff_sum + + # Masked entries will be forcefully kept in-place by setting their values to + # -inf everywhere, except for masked rows where they share equal probability + # with other masked items. + p_logits = tf.where(valid_pair_mask, p_logits, -math.inf) + p_logits = tf.where(_apply_pairwise_op(tf.logical_or, mask), p_logits, tf.zeros_like(p_logits)) + + # By swapping the rows of masked items to the end of the permutation matrix, + # we force masked items to be placed last. + sorted_mask_indices = tf.argsort(tf.cast(mask, dtype=tf.int32), axis=1, direction='DESCENDING', stable=True) + p_logits = tf.gather(p_logits, sorted_mask_indices, batch_dims=1, axis=1) + + smooth_perm = tf.nn.softmax(p_logits, -1) + + return smooth_perm + + +def gumbel_neural_sort(logits, name=None, sample_size=8, temperature=1.0, seed=None): + """Generate the permutation matrix from logits by stochastic neuralsort. + + By sampling logits from the Gumbel distribution, + sampled_logits = logits + Gumbel(0, 1), + the determinstic neural sort z of sampled_logits obeys the distribution with + Prob(z|logits) = (exp(logit_z1) / Z) * (exp(logit_z2) / Z-exp(logit_z1)) * + ... * (exp(logit_zn) / Z-sum_i^(n-1)exp(logit_zi)), + where Z = sum_i exp(logit_i). + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + name: A string used as the name for this loss. + sample_size: An integer representing the number of samples drawn from the + Concrete distribution defined by scores. + temperature: The Gumbel-Softmax temperature. + seed: Seed for pseudo-random number generator. + + Returns: + A `Tensor` of permutation matrices whose dimension is [batch_size, + sample_size, list_size, list_size]. + """ + with tf.compat.v1.name_scope(name, 'gumbel_neural_sort', [logits]): + batch_size = tf.shape(input=logits)[0] + list_size = tf.shape(input=logits)[1] + + # Sample logits from Concrete(logits). 
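+    # Each of the `sample_size` samples perturbs the logits with independent
+    # Gumbel(0, 1) noise; dividing by `temperature` below controls how close
+    # the relaxed permutation matrices are to hard permutations.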
+    sampled_logits = tf.expand_dims(logits, 1)
+    sampled_logits += _sample_gumbel([batch_size, sample_size, list_size], seed=seed)
+    sampled_logits = tf.reshape(sampled_logits, [batch_size * sample_size, list_size])
+
+    # Sort by constructing the relaxed permutation matrix from sampled logits.
+    smooth_perm = neural_sort(sampled_logits / temperature, name)
+    smooth_perm = tf.reshape(smooth_perm, [batch_size, sample_size, list_size, list_size])
+
+    return smooth_perm
+
+
+class OrdinalLoss(_PointwiseLoss):
+  """Implements ordinal loss."""
+
+  def __init__(self, name, ordinal_size, temperature=1.0, ragged=False, use_fraction_label=False):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      ordinal_size: An integer number of ordinal levels of labels.
+      temperature: A float number to modify the logits=logits/temperature.
+      ragged: A boolean indicating whether the input tensors are ragged.
+      use_fraction_label: A boolean indicating whether to leverage the fraction
+        part when the input labels contain fractions.
+    """
+    super().__init__(name, None, temperature, ragged)
+    self._ordinal_size = ordinal_size
+    self._use_fraction_label = use_fraction_label
+
+  def _labels_to_ordinals(self, labels, mask):
+    """Helper function to transform input labels to ordinal values.
+
+    When use_fraction_label is false, ordinals will be 1.0 if labels >= i for
+    the ordinal head i, with i = 1, ..., ordinal_size.
+    When use_fraction_label is true, the fraction part of labels will be counted
+    if labels > i-1 but < i.
+
+    For a fraction label 1.2, and ordinal_size=2
+      when use_fraction_label is false, it maps to an ordinal like [1.0, 0.0],
+      when use_fraction_label is true, it maps to an ordinal like [1.0, 0.2].
+
+    Args:
+      labels: A Tensor of shape [batch_size, list_size].
+      mask: A Tensor of shape [batch_size, list_size].
+
+    Returns:
+      ordinals, shape [batch_size, list_size, ordinal_size]
+    """
+    one_to_n = tf.range(1, self._ordinal_size + 1, dtype=tf.float32)
+    unsqueezed = tf.repeat(tf.expand_dims(labels, axis=2), self._ordinal_size, axis=-1)
+    ordinals = tf.where(unsqueezed >= one_to_n, tf.ones_like(unsqueezed), 0.0)
+    if self._use_fraction_label:
+      fractions = unsqueezed - one_to_n + 1.0
+      fractions = tf.where(tf.logical_and(fractions > 0.0, fractions < 1.0), fractions, 0.0)
+      ordinals += fractions
+    return tf.where(tf.expand_dims(mask, axis=-1), ordinals, 0.0)
+
+  def _compute_unreduced_loss_impl(self, labels, logits, mask=None):
+    """See `_RankingLoss`."""
+    if mask is None:
+      mask = utils.is_label_valid(labels)
+    if logits.shape.rank != 3:
+      raise ValueError('Predictions for ordinal loss must have rank 3.')
+    elif logits.shape[-1] != self._ordinal_size:
+      raise ValueError(
+          'The last dimension of logits must be the number of ordinal levels '
+          f'{self._ordinal_size}, the actual dimension is {logits.shape[-1]}.'
+      )
+    labels = tf.where(mask, labels, 0.0)
+    logits = tf.where(tf.expand_dims(mask, -1), logits, 0.0)
+    ordinals = self._labels_to_ordinals(labels, mask)
+    losses = tf.where(
+        tf.expand_dims(mask, -1), tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=ordinals, logits=logits), 0.0
+    )
+    return tf.reduce_sum(losses, axis=-1), tf.cast(mask, dtype=tf.float32)
+
+
+class MultiClassLoss(_PointwiseLoss):
+  """Implements multi-class loss."""
+
+  def __init__(self, name, num_classes, temperature=1.0, ragged=False, from_logits=False, label_smoothing=0.0):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      num_classes: An integer number of classes. To use this loss,
+        num_classes must be greater than 1.
+      temperature: A float number to modify the logits=logits/temperature.
+      ragged: A boolean indicating whether the input tensors are ragged.
+      from_logits: A boolean indicating whether the input is logits or probs.
+      label_smoothing: A float number for label smoothing.
+    """
+    super().__init__(name, None, temperature, ragged)
+    self._num_classes = num_classes
+    self._from_logits = from_logits
+    self._label_smoothing = label_smoothing
+
+  def _labels_to_one_hot_class(self, labels, mask):
+    """Helper function to transform input labels to one hot class labels.
+
+    Args:
+      labels: A Tensor of shape [batch_size, list_size].
+      mask: A Tensor of shape [batch_size, list_size].
+
+    Returns:
+      one-hot class label, shape [batch_size, list_size, num_classes]
+    """
+    classes = tf.one_hot(tf.cast(labels, tf.int32), self._num_classes, dtype=tf.float32)
+    return tf.where(tf.expand_dims(mask, axis=-1), classes, 0.0)
+
+  def _compute_unreduced_loss_impl(self, labels, logits, mask=None):
+    """See `_RankingLoss`."""
+    if mask is None:
+      mask = utils.is_label_valid(labels)
+    if logits.shape.rank != 3:
+      raise ValueError('Predictions for multi-class loss must have rank 3.')
+    elif logits.shape[-1] != self._num_classes:
+      raise ValueError(
+          'The last dimension of logits must be the number of classes '
+          f'{self._num_classes}, the actual dimension is {logits.shape[-1]}.'
+      )
+    labels = tf.where(mask, labels, 0.0)
+    logits = tf.where(tf.expand_dims(mask, -1), logits, 0.0)
+    classes = self._labels_to_one_hot_class(labels, mask)
+    losses = tf.keras.losses.CategoricalCrossentropy(
+        from_logits=self._from_logits,
+        label_smoothing=self._label_smoothing,
+        axis=-1,
+        reduction=tf.keras.losses.Reduction.NONE,
+        name='categorical_crossentropy'
+    )(classes, logits, tf.cast(mask, dtype=tf.float32))
+    return losses, tf.cast(mask, dtype=tf.float32)
+
+
+class CoupledRankDistilLoss(_ListwiseLoss):
+  r"""Implements Coupled-RankDistil loss.
+
+  The Coupled-RankDistil loss ([Reddi et al, 2021][reddi2021]) is the
+  cross-entropy between k-Plackett's probability of logits (student) and labels
+  (teacher).
+
+  The k-Plackett's probability model is defined as:
+  $$
+  \mathcal{P}_k(\pi|s) = \frac{1}{(N-k)!}
+  \prod_{i=1}^k \frac{\exp(s_{\pi(i)})}{\sum_{j=i}^N \exp(s_{\pi(j)})}.
+  $$
+
+  The Coupled-RankDistil loss is defined as:
+  $$
+  \mathcal{L}(y, s) = -\sum_{\pi} \mathcal{P}_k(\pi|y) \log \mathcal{P}(\pi|s) \\
+  = \mathcal{E}_{\pi \sim \mathcal{P}_k(\cdot|y)} [-\log \mathcal{P}(\pi|s)].
+  $$
+
+  References:
+    - [RankDistil: Knowledge Distillation for Ranking, Reddi et al,
+      2021][reddi2021]
+
+  [reddi2021]: https://research.google/pubs/pub50695/
+  """
+
+  def __init__(self, name, sample_size, topk=None, temperature=1., ragged=False):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      sample_size: Number of permutations to sample from teacher scores.
+      topk: top-k entries over which order is matched. A penalty is applied over
+        non top-k items.
+      temperature: A float number to modify the logits as
+        `logits=logits/temperature`.
+      ragged: A boolean indicating whether the input tensors are ragged.
+ """ + super().__init__(name, None, temperature, ragged) + self._sample_size = sample_size + self._topk = topk + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.where(mask, labels, tf.zeros_like(labels)) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + + teacher_scores = tf.where(mask, labels, tf.math.log(_EPSILON) * tf.ones_like(labels)) + + student_scores = tf.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + + # Sample teacher scores. + # [batch_size, list_size] -> [batch_size, sample_size, list_size]. + sampled_teacher_scores = tf.expand_dims(teacher_scores, 1) + sampled_teacher_scores = tf.repeat(sampled_teacher_scores, [self._sample_size], axis=1) + + batch_size, list_size = tf.unstack(tf.shape(input=labels)) + sampled_teacher_scores += _sample_gumbel([batch_size, self._sample_size, list_size], seed=37) + sampled_teacher_scores = tf.math.log(tf.nn.softmax(sampled_teacher_scores) + _EPSILON) + + # Expand student scores. + # [batch_size, list_size] -> [batch_size, sample_size, list_size]. + expanded_student_scores = tf.expand_dims(student_scores, 1) + expanded_student_scores = tf.repeat(expanded_student_scores, [self._sample_size], axis=1) + + # Sort teacher scores and student scores to obtain top-k student scores + # whose order is based on teacher scores. + sorted_student_scores = utils.sort_by_scores( + utils.reshape_first_ndims(sampled_teacher_scores, 2, [batch_size * self._sample_size]), + [utils.reshape_first_ndims(expanded_student_scores, 2, [batch_size * self._sample_size])], + shuffle_ties=True, + seed=37 + )[0] + sorted_student_scores = utils.reshape_first_ndims(sorted_student_scores, 1, [batch_size, self._sample_size]) + topk = self._topk or list_size + topk_student_scores = sorted_student_scores[:, :, :topk] + + # For \pi from teacher scores, compute top-k Plackett's probability as: + # \prod_{i=1}^k exp(s_{\pi(i)}) / \sum_{j=k}^N log(exp(s_{\pi(i)})). + + # Compute the denominator mask for \sum_{j=k}^N log(exp(s_{\pi(i)}). + # We apply logsumexp over valid entries in this mask. + # topk_pl_denominator_mask = batch x sample_size x valid_denom_entries, + # where valid_denom_entries = [[1 1 1 1 1 1] + # [0 1 1 1 1 1] + # [0 0 1 1 1 1]]. + # An alternative implementation would be to use `cumulative_logsumexp` with + # `reverse=True` to compute the denominator term. + ones = tf.ones((topk, list_size), dtype=tf.float32) + ones_upper = tf.linalg.band_part(ones, 0, -1) + topk_pl_denominator_mask = tf.tile(tf.expand_dims(ones_upper, axis=0), [batch_size * self._sample_size, 1, 1]) + # [batch_size * sample_size, topk, list_size] -> + # [batch_size, sample_size, topk, list_size]. + topk_pl_denominator_mask = tf.cast( + utils.reshape_first_ndims(topk_pl_denominator_mask, 1, [batch_size, self._sample_size]), dtype=tf.bool + ) + sorted_student_scores = tf.tile(tf.expand_dims(sorted_student_scores, 2), [1, 1, topk, 1]) + + sorted_student_scores_denom = tf.where( + topk_pl_denominator_mask, sorted_student_scores, + tf.math.log(_EPSILON) * tf.ones_like(sorted_student_scores) + ) + logprob = topk_student_scores - tf.math.reduce_logsumexp(sorted_student_scores_denom, axis=3) + # Compute log-likelihood over top-k Plackett-Luce scores. + # [batch_size, sample_size, topk] -> [batch_size, sample_size]. 
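+    # Summing the per-position log-probabilities gives the student's
+    # log-probability of each permutation sampled from the teacher scores.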
+ logprob = tf.reduce_sum(logprob, axis=2) + + # Compute RankDistil loss as a mean over samples. + # [batch_size, sample_size] -> [batch_size, 1]. + nll = tf.reduce_mean(-logprob, axis=1, keepdims=True) + + return nll, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) diff --git a/deepray/losses/quantiles.py b/deepray/losses/quantiles.py index 4bbb8843..1a4f9803 100644 --- a/deepray/losses/quantiles.py +++ b/deepray/losses/quantiles.py @@ -15,8 +15,9 @@ """Implements quantiles losses.""" import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper + from deepray.utils.types import TensorLike, FloatTensorLike @@ -68,7 +69,7 @@ def pinball_loss(y_true: TensorLike, y_pred: TensorLike, tau: FloatTensorLike = @tf.keras.utils.register_keras_serializable(package="Deepray") -class PinballLoss(LossFunctionWrapper): +class PinballLoss(losses.LossFunctionWrapper): """Computes the pinball loss between `y_true` and `y_pred`. `loss = maximum(tau * (y_true - y_pred), (tau - 1) * (y_true - y_pred))` diff --git a/deepray/losses/softmax_loss.py b/deepray/losses/softmax_loss.py new file mode 100644 index 00000000..76970d30 --- /dev/null +++ b/deepray/losses/softmax_loss.py @@ -0,0 +1,167 @@ +from typing import Any, Dict, Optional + +import tensorflow as tf + +from deepray.losses import losses_impl +from deepray.losses import utils + +# The smallest probability that is used to derive smallest logit for invalid or +# padding entries. +_EPSILON = 1e-10 + + +class _RankingLoss(tf.keras.losses.Loss): + """Base class for all ranking losses. + + Please see tf.keras.losses.Loss for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + ragged: bool = False + ): + super().__init__(reduction, name) + # An instance of loss in `losses_impl`. Overwrite this in subclasses. + self._loss = None + self._ragged = ragged + + def __call__( + self, + y_true: utils.TensorLike, + y_pred: utils.TensorLike, + sample_weight: Optional[utils.TensorLike] = None + ) -> tf.Tensor: + """See tf.keras.losses.Loss.""" + if self._loss is None: + raise ValueError('self._loss is not defined. 
Please use a subclass.') + sample_weight = self._loss.normalize_weights(y_true, sample_weight) + return super().__call__(y_true, y_pred, sample_weight) + + def call(self, y_true: utils.TensorLike, y_pred: utils.TensorLike) -> tf.Tensor: + """See tf.keras.losses.Loss.""" + y_pred = self._loss.get_logits(y_pred) + losses, weights = self._loss.compute_unreduced_loss(labels=y_true, logits=y_pred) + return tf.multiply(losses, weights) + + def get_config(self) -> Dict[str, Any]: + config = super().get_config() + config.update({'ragged': self._ragged}) + return config + + +class _ListwiseLoss(_RankingLoss): + """Base class for listwise ranking losses.""" + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + lambda_weight: Optional[losses_impl._LambdaWeight] = None, + temperature: float = 1.0, + ragged: bool = False, + **kwargs + ): + super().__init__(reduction, name, ragged) + self._lambda_weight = lambda_weight + self._temperature = temperature + + def get_config(self) -> Dict[str, Any]: + config = super().get_config() + config.update( + { + 'lambda_weight': utils.serialize_keras_object(self._lambda_weight), + 'temperature': self._temperature, + } + ) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + config.update({ + 'lambda_weight': utils.deserialize_keras_object(config['lambda_weight']), + }) + return cls(**config) + + +class SoftmaxLoss(_ListwiseLoss): + r"""Computes Softmax cross-entropy loss between `y_true` and `y_pred`. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + loss = - sum_i y_i * log(softmax(s_i)) + ``` + + Standalone usage: + + >>> y_true = [[1., 0.]] + >>> y_pred = [[0.6, 0.8]] + >>> loss = dp.losses.SoftmaxLoss() + >>> loss(y_true, y_pred).numpy() + 0.7981389 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[1., 0.], [0., 1., 0.]]) + >>> y_pred = tf.ragged.constant([[0.6, 0.8], [0.5, 0.8, 0.4]]) + >>> loss = dp.losses.SoftmaxLoss(ragged=True) + >>> loss(y_true, y_pred).numpy() + 0.83911896 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', loss=tfr.keras.losses.SoftmaxLoss()) + ``` + + Definition: + + $$ + \mathcal{L}(\{y\}, \{s\}) = - \sum_i y_i + \log\left(\frac{\exp(s_i)}{\sum_j \exp(s_j)}\right) + $$ + """ + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + lambda_weight: Optional[losses_impl._LambdaWeight] = None, + temperature: float = 1.0, + ragged: bool = False + ): + """Softmax cross-entropy loss. + + Args: + reduction: (Optional) The `tf.keras.losses.Reduction` to use (see + `tf.keras.losses.Loss`). + name: (Optional) The name for the op. + lambda_weight: (Optional) A lambdaweight to apply to the loss. Can be one + of `tfr.keras.losses.DCGLambdaWeight`, + `tfr.keras.losses.NDCGLambdaWeight`, or, + `tfr.keras.losses.PrecisionLambdaWeight`. + temperature: (Optional) The temperature to use for scaling the logits. + ragged: (Optional) If True, this loss will accept ragged tensors. If + False, this loss will accept dense tensors. 
+ """ + super().__init__(reduction, name, lambda_weight, temperature, ragged) + self._loss = losses_impl.SoftmaxLoss( + name='{}_impl'.format(name) if name else None, + lambda_weight=lambda_weight, + temperature=temperature, + ragged=ragged + ) + + def __call__( + self, + y_true: utils.TensorLike, + y_pred: utils.TensorLike, + sample_weight: Optional[utils.TensorLike] = None + ) -> tf.Tensor: + """See _RankingLoss.""" + losses, sample_weight = self._loss.compute_per_list(y_true, y_pred, sample_weight) + return tf.keras.__internal__.losses.compute_weighted_loss(losses, sample_weight, reduction=self._get_reduction()) diff --git a/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py b/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py new file mode 100644 index 00000000..5f92306b --- /dev/null +++ b/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py @@ -0,0 +1,377 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for masked LM loss.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.nlp.modeling import networks +from deepray.losses import weighted_sparse_categorical_crossentropy + + +@keras_parameterized.run_all_keras_modes +class ClassificationLossTest(keras_parameterized.TestCase): + + def create_lm_model(self, vocab_size, sequence_length, hidden_size, num_predictions, output="predictions"): + # First, create a transformer stack that we can use to get the LM's + # vocabulary weight. + xformer_stack = networks.TransformerEncoder( + vocab_size=vocab_size, + num_layers=1, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_attention_heads=4, + ) + word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + lm_outputs, _ = xformer_stack([word_ids, mask, type_ids]) + + # Create a maskedLM from the transformer stack. + test_network = networks.MaskedLM( + num_predictions=num_predictions, input_width=lm_outputs.shape[-1], source_network=xformer_stack, output=output + ) + + # Create a model from the masked LM layer. 
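+    # The returned Keras model maps (lm_input_tensor, masked_lm_positions) to
+    # the masked-LM outputs that the tests below use as `predictions`.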
+ lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) + masked_lm_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32) + output = test_network([lm_input_tensor, masked_lm_positions]) + return tf.keras.Model([lm_input_tensor, masked_lm_positions], output) + + def create_classification_model(self, input_width, num_classes): + test_object = networks.Classification(input_width=input_width, num_classes=num_classes) + # Create a 2-dimensional input (the first dimension is implicit). + pooled_data = tf.keras.Input(shape=(input_width,), dtype=tf.float32) + output = test_object(pooled_data) + return tf.keras.Model(pooled_data, output) + + def test_per_example_loss_3d_input(self): + """Test per-example loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate per-example loss. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + + # Per-example loss data should have one value per prediction, and those + # values shouldn't be zero in this case (as we're using random data). + expected_shape = [batch_size, num_predictions] + self.assertEqual(expected_shape, per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_per_example_loss_2d_input(self): + """Test per-example loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per example loss. + labels = np.random.randint(num_classes, size=(batch_size)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + + # Per-example loss data should have one value per batch item, and those + # values shouldn't be zero in this case (as we're using random data). + self.assertEqual([batch_size], per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_per_example_loss_weights_3d_input(self): + """Test weighted per-example loss with a 3-d input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. 
+ batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate per-example loss with weights. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + weights = np.random.randint(2, size=(batch_size, num_predictions)) + + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Weighted per-example loss data should be equivalent to multiplying the + # loss tensor by the weights tensor. + expected_weighted_loss = per_example_loss_data * weights + self.assertAllClose(expected_weighted_loss, per_example_loss_data) + + def test_per_example_loss_weights_2d_input(self): + """Test weighted per-example loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per-example loss with weights. + labels = np.random.randint(num_classes, size=(batch_size)) + weights = np.random.randint(2, size=(batch_size)) + + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Weighted per-example loss data should be equivalent to multiplying the + # loss tensor by the weights tensor. + expected_weighted_loss = per_example_loss_data * weights + self.assertAllClose(expected_weighted_loss, per_example_loss_data) + + def test_loss_3d_input(self): + """Test overall loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate loss. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + weights = np.random.randint(2, size=(batch_size, num_predictions)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Total loss data should have one value, and that value shouldn't be zero + # in this case (as we're using random data). + expected_shape = [] # Scalar + self.assertEqual(expected_shape, per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_loss_2d_input(self): + """Test overall loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per example loss. 
+ labels = np.random.randint(num_classes, size=(batch_size)) + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels) + + # Loss data should have one value only, and that value shouldn't be zero in + # this case (as we're using random data). + self.assertNotAllClose(0, loss_data) + + def test_loss_weights_3d_input(self): + """Test masked loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate a fully masked weight tensor. This should give a loss of zero. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + null_weights = np.zeros((batch_size, num_predictions)) + weighted_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=null_weights + ) + + # Because the tensor is fully masked, the loss should be 0. + self.assertAllClose(0, weighted_loss_data) + + def test_loss_weights_2d_input(self): + """Test masked loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate a fully masked weight tensor. This should give a loss of zero. + labels = np.random.randint(num_classes, size=(batch_size)) + null_weights = np.zeros((batch_size)) + weighted_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=null_weights + ) + + # Because the tensor is fully masked, the loss should be 0. + self.assertAllClose(0, weighted_loss_data) + + def test_mismatched_predictions_and_labels_ranks_squeezes(self): + """Test that the loss asserts when rank(predictions)-1 != rank(labels).""" + batch_size = 3 + output_data = np.random.random_sample((batch_size, 10)) + labels = np.random.randint(10, size=(batch_size, 1)) + + # All that this test tests is that the squeeze is successful. 
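+    # `labels` carries a trailing dimension of size 1 that the loss
+    # implementation should squeeze away instead of raising an error.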
+ _ = weighted_sparse_categorical_crossentropy.per_example_loss(predictions=output_data, labels=labels) + + def test_mismatched_weights_and_labels_ranks_fail(self): + """Test that the loss asserts when rank(predictions) != rank(labels).""" + batch_size = 3 + output_data = np.random.random_sample((batch_size, 10, 15)) + labels = np.random.randint(10, size=(batch_size, 10)) + weights = np.random.randint(2, size=(batch_size)) + + with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"): + _ = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"): + _ = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + + def test_tf_tensor_inputs(self): + """Test that tf.Tensors can be used as inputs to the loss function.""" + batch_size = 3 + output_data = tf.convert_to_tensor(np.random.random_sample((batch_size, 10, 15))) + labels = tf.convert_to_tensor(np.random.randint(10, size=(batch_size, 10))) + weights = tf.convert_to_tensor(np.random.randint(2, size=(batch_size, 10))) + + # We're not trying to validate numerical correctness, just ensure that + # we can in fact pass tensors to these functions without causing runtime + # errors from the shape checking code. + _ = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + _ = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + + def test_legacy_lm_loss_compatibility(self): + """Test to validate computational correctness during refactors.""" + # This is the empirical output of a masked LM with the following parameters: + # batch_size = 3 + # vocab_size = 5 + # sequence_length = 4 + # num_predictions = 2 + output_data = np.array( + [ + [ + [-2.5286622, -1.0963473, -1.4925185, -2.4451098, -1.2923571], + [-2.7117882, -1.1205841, -4.02187, -0.9966936, -1.5119683] + ], + [ + [-2.5379114, -0.82479054, -2.287932, -1.3747153, -2.053741], + [-2.5379114, -0.82479054, -2.287932, -1.3747153, -2.053741] + ], + [ + [-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509], + [-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509] + ] + ] + ) + labels = np.array([[4, 0], [2, 2], [2, 1]]) + + # Validate that per_example loss calculations are the same. + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + expected_per_example_loss_data = [[1.2923571, 2.7117882], [2.287932, 2.287932], [3.0924666, 1.8219438]] + self.assertAllClose(expected_per_example_loss_data, per_example_loss_data) + + # Validate that overall loss calculations are the same. + weights = np.array([[1, 0], [0, 0], [0, 0]]) + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + expected_loss_data = 1.2923441 + self.assertAllClose(expected_loss_data, loss_data) + + def test_legacy_classification_loss_compatibility(self): + """Test to validate computational correctness during refactors.""" + # This is the empirical output of a classifier with the following params: + # batch_size = 2 + # num_classes = 3 + output_data = np.array( + [[-1.6094601e-03, -1.0966038e+01, -6.4434357e+00], [-1.6975292e-03, -6.4009643e+00, -1.0226612e+01]] + ) + labels = np.array([2, 1]) + + # Validate that per_example loss calculations are the same. 
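+    # `output_data` holds log-probabilities, so each expected per-example loss
+    # is the negated entry of `output_data` at the corresponding label index.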
+ per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + expected_per_example_loss_data = [6.4434357, 6.4009643] + self.assertAllClose(expected_per_example_loss_data, per_example_loss_data) + + # Validate that overall loss calculations are the same. + weights = None + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + expected_loss_data = 6.4222 + self.assertAllClose(expected_loss_data, loss_data) + + +if __name__ == "__main__": + tf.test.main() diff --git a/deepray/losses/triplet.py b/deepray/losses/triplet.py index 0dea42a7..df566b28 100644 --- a/deepray/losses/triplet.py +++ b/deepray/losses/triplet.py @@ -14,12 +14,14 @@ # ============================================================================== """Implements triplet loss.""" +from typing import Optional, Union, Callable + import tensorflow as tf +from tensorflow.python.keras import losses +from typeguard import typechecked + from deepray.losses import metric_learning -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import FloatTensorLike, TensorLike -from typeguard import typechecked -from typing import Optional, Union, Callable def _masked_maximum(data, mask, dim=1): @@ -272,7 +274,7 @@ def triplet_hard_loss( @tf.keras.utils.register_keras_serializable(package="Deepray") -class TripletSemiHardLoss(LossFunctionWrapper): +class TripletSemiHardLoss(losses.LossFunctionWrapper): """Computes the triplet loss with semi-hard negative mining. The loss encourages the positive distances (between a pair of embeddings @@ -309,7 +311,7 @@ def __init__( @tf.keras.utils.register_keras_serializable(package="Deepray") -class TripletHardLoss(LossFunctionWrapper): +class TripletHardLoss(losses.LossFunctionWrapper): """Computes the triplet loss with hard negative and hard positive mining. The loss encourages the maximum positive distance (between a pair of embeddings diff --git a/deepray/losses/utils.py b/deepray/losses/utils.py new file mode 100644 index 00000000..ee3d2f4d --- /dev/null +++ b/deepray/losses/utils.py @@ -0,0 +1,563 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for ranking library.""" + +from typing import Callable, Dict, Tuple + +import tensorflow as tf + +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. 
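+# The padding values above are used by `ragged_to_dense` below when converting
+# ragged inputs to dense tensors: padded labels become -1 (invalid), padded
+# predictions get a very low score, and padded weights become 0.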
+ +TensorLike = tf.types.experimental.TensorLike +TransformationFunction = Callable[[TensorLike], tf.Tensor] +LossFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +MetricFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] + + +def _to_nd_indices(indices): + """Returns indices used for tf.gather_nd or tf.scatter_nd. + + Args: + indices: A `Tensor` of shape [batch_size, size] with integer values. The + values are the indices of another `Tensor`. For example, `indices` is the + output of tf.argsort or tf.math.top_k. + + Returns: + A `Tensor` with shape [batch_size, size, 2] that can be used by tf.gather_nd + or tf.scatter_nd. + + """ + indices.get_shape().assert_has_rank(2) + batch_ids = tf.ones_like(indices) * tf.expand_dims(tf.range(tf.shape(input=indices)[0]), 1) + return tf.stack([batch_ids, indices], axis=-1) + + +def gather_per_row(inputs, indices): + """Gathers the values from input tensor based on per-row indices. + + Example Usage: + ```python + scores = [[1., 3., 2.], [1., 2., 3.]] + indices = [[1, 2], [2, 1]] + tfr.utils.gather_per_row(scores, indices) + ``` + Returns [[3., 2.], [3., 2.]] + + Args: + inputs: (tf.Tensor) A tensor of shape [batch_size, list_size] or + [batch_size, list_size, feature_dims]. + indices: (tf.Tensor) A tensor of shape [batch_size, size] of positions to + gather inputs from. Each index corresponds to a row entry in input_tensor. + + Returns: + A tensor of values gathered from inputs, of shape [batch_size, size] or + [batch_size, size, feature_dims], depending on whether the input was 2D or + 3D. + """ + indices = tf.cast(indices, dtype=tf.int32) + return tf.gather(inputs, indices, batch_dims=1, axis=1) + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) + + +def _get_shuffle_indices(shape, mask=None, shuffle_ties=True, seed=None): + """Gets indices which would shuffle a tensor. + + Args: + shape: The shape of the indices to generate. + mask: An optional mask that indicates which entries to place first. Its + shape should be equal to given shape. + shuffle_ties: Whether to randomly shuffle ties. + seed: The ops-level random seed. + + Returns: + An int32 `Tensor` with given `shape`. Its entries are indices that would + (randomly) shuffle the values of a `Tensor` of given `shape` along the last + axis while placing masked items first. + """ + # Generate random values when shuffling ties or all zeros when not. + if shuffle_ties: + shuffle_values = tf.random.uniform(shape, seed=seed) + else: + shuffle_values = tf.zeros(shape, dtype=tf.float32) + + # Since shuffle_values is always in [0, 1), we can safely increase entries + # where mask=False with 2.0 to make sure those are placed last during the + # argsort op. + if mask is not None: + shuffle_values = tf.where(mask, shuffle_values, shuffle_values + 2.0) + + # Generate indices by sorting the shuffle values. + return tf.argsort(shuffle_values, stable=True) + + +def sort_by_scores(scores, features_list, topn=None, shuffle_ties=True, seed=None, mask=None): + """Sorts list of features according to per-example scores. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. 
+ features_list: A list of `Tensor`s to be sorted. The shape of the `Tensor` + can be [batch_size, list_size] or [batch_size, list_size, feature_dims]. + The latter is applicable for example features. + topn: An integer as the cutoff of examples in the sorted list. + shuffle_ties: A boolean. If True, randomly shuffle before the sorting. + seed: The ops-level random seed used when `shuffle_ties` is True. + mask: An optional `Tensor` of shape [batch_size, list_size] representing + which entries are valid for sorting. Invalid entries will be pushed to the + end. + + Returns: + A list of `Tensor`s as the list of sorted features by `scores`. + """ + with tf.compat.v1.name_scope(name='sort_by_scores'): + scores = tf.cast(scores, tf.float32) + scores.get_shape().assert_has_rank(2) + list_size = tf.shape(input=scores)[1] + if topn is None: + topn = list_size + topn = tf.minimum(topn, list_size) + + # Set invalid entries (those whose mask value is False) to the minimal value + # of scores so they will be placed last during sort ops. + if mask is not None: + scores = tf.where(mask, scores, tf.reduce_min(scores)) + + # Shuffle scores to break ties and/or push invalid entries (according to + # mask) to the end. + shuffle_ind = None + if shuffle_ties or mask is not None: + shuffle_ind = _get_shuffle_indices(tf.shape(input=scores), mask, shuffle_ties=shuffle_ties, seed=seed) + scores = tf.gather(scores, shuffle_ind, batch_dims=1, axis=1) + + # Perform sort and return sorted feature_list entries. + _, indices = tf.math.top_k(scores, topn, sorted=True) + if shuffle_ind is not None: + indices = tf.gather(shuffle_ind, indices, batch_dims=1, axis=1) + return [tf.gather(f, indices, batch_dims=1, axis=1) for f in features_list] + + +def sorted_ranks(scores, shuffle_ties=True, seed=None): + """Returns an int `Tensor` as the ranks (1-based) after sorting scores. + + Example: Given scores = [[1.0, 3.5, 2.1]], the returned ranks will be [[3, 1, + 2]]. It means that scores 1.0 will be ranked at position 3, 3.5 will be ranked + at position 1, and 2.1 will be ranked at position 2. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. + shuffle_ties: See `sort_by_scores`. + seed: See `sort_by_scores`. + + Returns: + A 1-based int `Tensor`s as the ranks. + """ + with tf.compat.v1.name_scope(name='sorted_ranks'): + batch_size, list_size = tf.unstack(tf.shape(input=scores)) + # The current position in the list for each score. + positions = tf.tile(tf.expand_dims(tf.range(list_size), 0), [batch_size, 1]) + # For score [[1.0, 3.5, 2.1]], sorted_positions are [[1, 2, 0]], meaning the + # largest score is at position 1, the 2nd is at position 2 and 3rd is at + # position 0. + sorted_positions = sort_by_scores(scores, [positions], shuffle_ties=shuffle_ties, seed=seed)[0] + # The indices of sorting sorted_positions will be [[2, 0, 1]] and ranks are + # 1-based and thus are [[3, 1, 2]]. + ranks = tf.argsort(sorted_positions) + 1 + return ranks + + +def shuffle_valid_indices(is_valid, seed=None): + """Returns a shuffle of indices with valid ones on top.""" + return organize_valid_indices(is_valid, shuffle=True, seed=seed) + + +def organize_valid_indices(is_valid, shuffle=True, seed=None): + """Organizes indices in such a way that valid items appear first. + + Args: + is_valid: A boolean `Tensor` for entry validity with shape [batch_size, + list_size]. + shuffle: A boolean indicating whether valid items should be shuffled. + seed: An int for random seed at the op level. 
It works together with the + seed at global graph level together to determine the random number + generation. See `tf.set_random_seed`. + + Returns: + A tensor of indices with shape [batch_size, list_size, 2]. The returned + tensor can be used with `tf.gather_nd` and `tf.scatter_nd` to compose a new + [batch_size, list_size] tensor. The values in the last dimension are the + indices for an element in the input tensor. + """ + with tf.compat.v1.name_scope(name='organize_valid_indices'): + is_valid = tf.convert_to_tensor(value=is_valid) + is_valid.get_shape().assert_has_rank(2) + output_shape = tf.shape(input=is_valid) + + if shuffle: + values = tf.random.uniform(output_shape, seed=seed) + else: + values = ( + tf.ones_like(is_valid, tf.float32) * tf.reverse(tf.cast(tf.range(output_shape[1]), dtype=tf.float32), [-1]) + ) + + rand = tf.where(is_valid, values, tf.ones(output_shape) * -1e-6) + # shape(indices) = [batch_size, list_size] + indices = tf.argsort(rand, direction='DESCENDING', stable=True) + return _to_nd_indices(indices) + + +def reshape_first_ndims(tensor, first_ndims, new_shape): + """Reshapes the first n dims of the input `tensor` to `new shape`. + + Args: + tensor: The input `Tensor`. + first_ndims: A int denoting the first n dims. + new_shape: A list of int representing the new shape. + + Returns: + A reshaped `Tensor`. + """ + assert tensor.get_shape().ndims is None or tensor.get_shape().ndims >= first_ndims, ( + 'Tensor shape is less than {} dims.'.format(first_ndims) + ) + new_shape = tf.concat([new_shape, tf.shape(input=tensor)[first_ndims:]], 0) + if isinstance(tensor, tf.SparseTensor): + return tf.sparse.reshape(tensor, new_shape) + + return tf.reshape(tensor, new_shape) + + +def reshape_to_2d(tensor): + """Converts the given `tensor` to a 2-D `Tensor`.""" + with tf.compat.v1.name_scope(name='reshape_to_2d'): + rank = tensor.shape.rank if tensor.shape is not None else None + if rank is not None and rank != 2: + if rank >= 3: + tensor = tf.reshape(tensor, tf.shape(input=tensor)[0:2]) + else: + while tensor.shape.rank < 2: + tensor = tf.expand_dims(tensor, -1) + return tensor + + +def _circular_indices(size, num_valid_entries): + """Creates circular indices with padding and mask for non-padded ones. + + This returns a indices and a mask Tensor, where the mask is True for valid + entries and False for padded entries. + + The returned indices have the shape of [batch_size, size], where the + batch_size is obtained from the 1st dim of `num_valid_entries`. For a + batch_size = 1, when size = 3, returns [[0, 1, 2]], when num_valid_entries = + 2, returns [[0, 1, 0]]. The first 2 are valid and the returned mask is [True, + True, False]. + + Args: + size: A scalar int `Tensor` for the size. + num_valid_entries: A 1-D `Tensor` with shape [batch_size] representing the + number of valid entries for each instance in a batch. + + Returns: + A tuple of Tensors (batch_indices, batch_indices_mask). The first has + shape [batch_size, size] and the second has shape [batch_size, size]. + """ + with tf.compat.v1.name_scope(name='circular_indices'): + # shape = [batch_size, size] with value [[0, 1, ...], [0, 1, ...], ...]. + batch_indices = tf.tile(tf.expand_dims(tf.range(size), 0), [tf.shape(input=num_valid_entries)[0], 1]) + num_valid_entries = tf.reshape(num_valid_entries, [-1, 1]) + batch_indices_mask = tf.less(batch_indices, num_valid_entries) + # Use mod to make the indices to the ranges of valid entries. 
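+    # Lists with zero valid entries are clamped to 1 so the modulo below never
+    # divides by zero; their `batch_indices_mask` is already all False.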
+ num_valid_entries = tf.where(tf.less(num_valid_entries, 1), tf.ones_like(num_valid_entries), num_valid_entries) + batch_indices = tf.math.mod(batch_indices, num_valid_entries) + return batch_indices, batch_indices_mask + + +def padded_nd_indices(is_valid, shuffle=False, seed=None): + """Pads the invalid entries by valid ones and returns the nd_indices. + + For example, when we have a batch_size = 1 and list_size = 3. Only the first 2 + entries are valid. We have: + ``` + is_valid = [[True, True, False]] + nd_indices, mask = padded_nd_indices(is_valid) + ``` + nd_indices has a shape [1, 3, 2] and mask has a shape [1, 3]. + + ``` + nd_indices = [[[0, 0], [0, 1], [0, 0]]] + mask = [[True, True, False]] + ``` + nd_indices can be used by gather_nd on a Tensor t + ``` + padded_t = tf.gather_nd(t, nd_indices) + ``` + and get the following Tensor with first 2 dims are [1, 3]: + ``` + padded_t = [[t(0, 0), t(0, 1), t(0, 0)]] + ``` + + Args: + is_valid: A boolean `Tensor` for entry validity with shape [batch_size, + list_size]. + shuffle: A boolean that indicates whether valid indices should be shuffled. + seed: Random seed for shuffle. + + Returns: + A tuple of Tensors (nd_indices, mask). The first has shape [batch_size, + list_size, 2] and it can be used in gather_nd or scatter_nd. The second has + the shape of [batch_size, list_size] with value True for valid indices. + """ + with tf.compat.v1.name_scope(name='nd_indices_with_padding'): + is_valid = tf.convert_to_tensor(value=is_valid) + list_size = tf.shape(input=is_valid)[1] + num_valid_entries = tf.reduce_sum(input_tensor=tf.cast(is_valid, dtype=tf.int32), axis=1) + indices, mask = _circular_indices(list_size, num_valid_entries) + # Valid indices of the tensor are shuffled and put on the top. + # [batch_size, list_size, 2]. + shuffled_indices = organize_valid_indices(is_valid, shuffle=shuffle, seed=seed) + # Construct indices for gather_nd [batch_size, list_size, 2]. + nd_indices = _to_nd_indices(indices) + nd_indices = tf.gather_nd(shuffled_indices, nd_indices) + return nd_indices, mask + + +def de_noise(counts, noise, ratio=0.9): + """Returns a float `Tensor` as the de-noised `counts`. + + The implementation is based on the the paper by Zhang and Xu: "Fast Exact + Maximum Likelihood Estimation for Mixture of Language Models." It assumes that + the observed `counts` are generated from a mixture of `noise` and the true + distribution: `ratio * noise_distribution + (1 - ratio) * true_distribution`, + where the contribution of `noise` is controlled by `ratio`. This method + returns the true distribution. + + Args: + counts: A 2-D `Tensor` representing the observations. All values should be + nonnegative. + noise: A 2-D `Tensor` representing the noise distribution. This should be + the same shape as `counts`. All values should be positive and are + normalized to a simplex per row. + ratio: A float in (0, 1) representing the contribution from noise. + + Returns: + A 2-D float `Tensor` and each row is a simplex. + Raises: + ValueError: if `ratio` is not in (0,1). + InvalidArgumentError: if any of `counts` is negative or any of `noise` is + not positive. 
+ """ + if not 0 < ratio < 1: + raise ValueError('ratio should be in (0, 1), but get {}'.format(ratio)) + odds = (1 - ratio) / ratio + + counts = tf.cast(counts, dtype=tf.float32) + noise = tf.cast(noise, dtype=tf.float32) + + counts.get_shape().assert_has_rank(2) + noise.get_shape().assert_has_rank(2) + noise.get_shape().assert_is_compatible_with(counts.get_shape()) + + with tf.compat.v1.name_scope(name='de_noise'): + counts_nonneg = tf.debugging.assert_greater_equal(counts, 0.) + noise_pos = tf.debugging.assert_greater(noise, 0.) + with tf.control_dependencies([counts_nonneg, noise_pos]): + # Normalize noise to be a simplex per row. + noise = noise / tf.reduce_sum(noise, axis=1, keepdims=True) + sorted_idx = tf.argsort(counts / noise, direction='DESCENDING', stable=True) + nd_indices = _to_nd_indices(sorted_idx) + sorted_counts = tf.gather_nd(counts, nd_indices) + sorted_noise = tf.gather_nd(noise, nd_indices) + # Decide whether an entry will have a positive value or 0. + is_pos = tf.cast( + (odds + tf.cumsum(sorted_noise, axis=1)) / tf.cumsum(sorted_counts, axis=1) > sorted_noise / sorted_counts, + tf.float32 + ) + # The lambda in the paper above, which is the lagrangian multiplier for + # the simplex constraint on the variables. + lagrangian_multiplier = tf.reduce_sum(sorted_counts * is_pos, axis=1, keepdims=True + ) / (1 + tf.reduce_sum(sorted_noise * is_pos, axis=1, keepdims=True) / odds) + res = (sorted_counts / lagrangian_multiplier - sorted_noise / odds) * is_pos + return tf.scatter_nd(nd_indices, res, shape=tf.shape(counts)) + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask + + +def parse_keys_and_weights(key: str) -> Dict[str, float]: + """Parses the encoded key to keys and weights. + + This parse function will remove all spaces. Different keys are split by "," + and then weight associated with key is split by ":". + + Args: + key: A string represents a key, or a string of multiple keys, split by ",", + and weighted by the weights split by ":". For example, key = + 'softmax_loss:0.9,sigmoid_cross_entropy_loss:0.1'. + + Returns: + A dict from keys to weights. + """ + + def _parse(key_with_weight: str) -> Tuple[str, float]: + if ':' in key_with_weight: + pair = key_with_weight.split(':') + else: + pair = [key_with_weight, 1.0] + + return pair[0], float(pair[1]) + + # Remove spaces. 
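+ # e.g. 'softmax_loss: 0.9, sigmoid_cross_entropy_loss: 0.1' becomes
+ # 'softmax_loss:0.9,sigmoid_cross_entropy_loss:0.1' before splitting.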
+ key = key.replace(' ', '')
+ # Single objective or multiple objectives with weights:
+ keys_to_weights = dict(_parse(loss_key_with_weight) for loss_key_with_weight in key.split(','))
+
+ return keys_to_weights
+
+
+def serialize_keras_object(obj):
+ if hasattr(tf.keras.utils, "legacy"):
+ return tf.keras.utils.legacy.serialize_keras_object(obj)
+ else:
+ return tf.keras.utils.serialize_keras_object(obj)
+
+
+def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None):
+ if hasattr(tf.keras.utils, "legacy"):
+ return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name)
+ else:
+ return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name)
+
+
+# The following functions are used to transform labels and ranks for losses and
+# metrics computation. User customized functions can be defined similarly by
+# following the same annotations.
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def identity(label: TensorLike) -> tf.Tensor:
+ """Identity function that returns the input label.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ The input label.
+ """
+ return label
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def inverse(rank: TensorLike) -> tf.Tensor:
+ """Computes the inverse of input rank.
+
+ Args:
+ rank: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `1/x`.
+ """
+ return tf.math.divide_no_nan(1., rank)
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def pow_minus_1(label: TensorLike) -> tf.Tensor:
+ """Computes `2**x - 1` element-wise for each label.
+
+ Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `2**x - 1`.
+ """
+ return tf.math.pow(2., label) - 1.
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def log2_inverse(rank: TensorLike) -> tf.Tensor:
+ """Computes `1./log2(1+x)` element-wise for each rank.
+
+ Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`.
+
+ Args:
+ rank: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`.
+ """
+ return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank))
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def is_greater_equal_1(label: TensorLike) -> tf.Tensor:
+ """Computes whether label is greater than or equal to 1.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `I(x >= 1)`.
+ """
+ return tf.greater_equal(label, 1.0)
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def symmetric_log1p(t: TensorLike) -> tf.Tensor:
+ """Computes `sign(x) * log(1 + |x|)` element-wise.
+
+ Args:
+ t: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to
+ `sign(x) * log(1 + |x|)`.
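+
+ For example, this maps `-3.` to roughly `-1.386` and `4.` to roughly
+ `1.609`.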
+ """
+ return tf.math.log1p(t * tf.sign(t)) * tf.sign(t)
diff --git a/deepray/losses/weighted_sparse_categorical_crossentropy.py b/deepray/losses/weighted_sparse_categorical_crossentropy.py
new file mode 100644
index 00000000..e21a86f9
--- /dev/null
+++ b/deepray/losses/weighted_sparse_categorical_crossentropy.py
@@ -0,0 +1,108 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sparse categorical cross-entropy losses."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def _adjust_labels(labels, predictions):
+ """Adjust the 'labels' tensor by squeezing it if needed."""
+ labels = tf.cast(labels, tf.int32)
+ if len(predictions.shape) == len(labels.shape):
+ labels = tf.squeeze(labels, [-1])
+ return labels, predictions
+
+
+def _validate_rank(labels, predictions, weights):
+ if weights is not None and len(weights.shape) != len(labels.shape):
+ raise RuntimeError(
+ ("Weight and label tensors were not of the same rank. weights.shape "
+ "was %s, and labels.shape was %s.") % (weights.shape, labels.shape)
+ )
+ if (len(predictions.shape) - 1) != len(labels.shape):
+ raise RuntimeError(
+ (
+ "Weighted sparse categorical crossentropy expects `labels` to have a "
+ "rank of one less than `predictions`. labels.shape was %s, and "
+ "predictions.shape was %s."
+ ) % (labels.shape, predictions.shape)
+ )
+
+
+def per_example_loss(labels, predictions, weights=None):
+ """Calculate a per-example sparse categorical crossentropy loss.
+
+ This loss function assumes that the predictions are post-softmax.
+ Args:
+ labels: The labels to evaluate against. Should be a set of integer indices
+ ranging from 0 to (vocab_size-1).
+ predictions: The network predictions. Should have softmax already applied.
+ weights: An optional weight array of the same shape as the 'labels' array.
+ If None, all examples will be used.
+
+ Returns:
+ A tensor of shape predictions.shape[:-1] containing the per-example
+ loss.
+ """
+ # When using these functions with the Keras core API, we will need to squeeze
+ # the labels tensor - Keras adds a spurious inner dimension.
+ labels, predictions = _adjust_labels(labels, predictions)
+ _validate_rank(labels, predictions, weights)
+
+ labels_one_hot = tf.keras.backend.one_hot(labels, predictions.shape[-1])
+ labels_one_hot = tf.keras.backend.cast(labels_one_hot, predictions.dtype)
+ per_example_loss_data = -tf.keras.backend.sum(predictions * labels_one_hot, axis=[-1])
+ if weights is not None:
+ weights = tf.keras.backend.cast(weights, per_example_loss_data.dtype)
+ per_example_loss_data = weights * per_example_loss_data
+ return per_example_loss_data
+
+
+def loss(labels, predictions, weights=None):
+ """Calculate a per-batch sparse categorical crossentropy loss.
+ + This loss function assumes that the predictions are post-softmax. + Args: + labels: The labels to evaluate against. Should be a set of integer indices + ranging from 0 to (vocab_size-1). + predictions: The network predictions. Should have softmax already applied. + weights: An optional weight array of the same shape as the 'labels' array. + If None, all examples will be used. + + Returns: + A loss scalar. + + Raises: + RuntimeError if the passed tensors do not have the same rank. + """ + # When using these functions with the Keras core API, we will need to squeeze + # the labels tensor - Keras adds a spurious inner dimension. + labels, predictions = _adjust_labels(labels, predictions) + _validate_rank(labels, predictions, weights) + + per_example_loss_data = per_example_loss(labels, predictions, weights) + + if weights is None: + return tf.keras.backend.mean(per_example_loss_data) + else: + numerator = tf.keras.backend.sum(per_example_loss_data) + weights = tf.keras.backend.cast(weights, predictions.dtype) + denominator = tf.keras.backend.sum(weights) + 1e-5 + return numerator / denominator diff --git a/deepray/metrics/__init__.py b/deepray/metrics/__init__.py index f628fa35..8f3c2214 100755 --- a/deepray/metrics/__init__.py +++ b/deepray/metrics/__init__.py @@ -22,13 +22,6 @@ hamming_loss_fn, ) from deepray.metrics.utils import MeanMetricWrapper -from deepray.metrics.matthews_correlation_coefficient import ( - MatthewsCorrelationCoefficient, -) -from deepray.metrics.multilabel_confusion_matrix import ( - MultiLabelConfusionMatrix, -) -from deepray.metrics.r_square import RSquare from deepray.metrics.geometric_mean import GeometricMean from deepray.metrics.harmonic_mean import HarmonicMean from deepray.metrics.streaming_correlations import ( @@ -37,3 +30,5 @@ PearsonsCorrelation, SpearmansRank, ) +from deepray.metrics.ndcg import NDCGMetric +from deepray.metrics.mrr import MRRMetric \ No newline at end of file diff --git a/deepray/metrics/_ranking.py b/deepray/metrics/_ranking.py new file mode 100644 index 00000000..c85a9d6c --- /dev/null +++ b/deepray/metrics/_ranking.py @@ -0,0 +1,165 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf +from typing import Callable + +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. 
+ self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +def serialize_keras_object(obj): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.serialize_keras_object(obj) + else: + return tf.keras.utils.serialize_keras_object(obj) + + +def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + else: + return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + + +# The following functions are used to transform labels and ranks for losses and +# metrics computation. User customized functions can be defined similarly by +# following the same annotations. +def identity(label: TensorLike) -> tf.Tensor: + """Identity function that returns the input label. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + The input label. + """ + return label + + +def inverse(rank: TensorLike) -> tf.Tensor: + """Computes the inverse of input rank. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1/x`. + """ + return tf.math.divide_no_nan(1., rank) + + +def pow_minus_1(label: TensorLike) -> tf.Tensor: + """Computes `2**x - 1` element-wise for each label. + + Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `2**x - 1`. + """ + return tf.math.pow(2., label) - 1. + + +def log2_inverse(rank: TensorLike) -> tf.Tensor: + """Computes `1./log2(1+x)` element-wise for each label. + + Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`. + """ + return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank)) + + +def is_greater_equal_1(label: TensorLike) -> tf.Tensor: + """Computes whether label is greater or equal to 1. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `I(x > 1)`. 
+ """ + return tf.greater_equal(label, 1.0) + + +def symmetric_log1p(t: TensorLike) -> tf.Tensor: + """Computes `sign(x) * log(1 + sign(x))`. + + Args: + t: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `I(x > 1)`. + """ + return tf.math.log1p(t * tf.sign(t)) * tf.sign(t) diff --git a/deepray/metrics/alpha_dcg.py b/deepray/metrics/alpha_dcg.py new file mode 100644 index 00000000..ae1f2cde --- /dev/null +++ b/deepray/metrics/alpha_dcg.py @@ -0,0 +1,126 @@ +from ._ranking import _RankingMetric + + +class AlphaDCGMetric(_RankingMetric): + r"""Alpha discounted cumulative gain (alphaDCG). + + Alpha discounted cumulative gain ([Clarke et al, 2008][clarke2008]; + [Clarke et al, 2009][clarke2009]) is a cumulative gain metric that operates + on subtopics and is typically used for diversification tasks. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + alphaDCG(y, s) = sum_t sum_i gain(y_{i,t}) * rank_discount(rank(s_i)) + gain(y_{i,t}) = (1 - alpha)^(sum_j I[rank(s_j) < rank(s_i)] * gain(y_{j,t})) + ``` + + NOTE: The labels `y_true` should be of shape + `[batch_size, list_size, subtopic_size]`, indicating relevance for each + subtopic in the last dimension. + + NOTE: The `rank_discount_fn` should be keras serializable. Please see + `tfr.keras.utils.log2_inverse` as an example when defining user customized + functions. + + Standalone usage: + + >>> y_true = [[[0., 1.], [1., 0.], [1., 1.]]] + >>> y_pred = [[3., 1., 2.]] + >>> alpha_dcg = tfr.keras.metrics.AlphaDCGMetric() + >>> alpha_dcg(y_true, y_pred).numpy() + 2.1963947 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant( + ... [[[0., 0.], [1., 0.]], [[1., 1.], [0., 2.], [1., 0.]]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> alpha_dcg = tfr.keras.metrics.AlphaDCGMetric(ragged=True) + >>> alpha_dcg(y_true, y_pred).numpy() + 1.8184297 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.AlphaDCGMetric()]) + ``` + + Definition: + + $$ + \alpha\text{DCG}(y, s) = + \sum_t \sum_i \text{gain}(y_{i, t}, \alpha) + \text{ rank_discount}(\text{rank}(s_i))\\ + \text{gain}(y_{i, t}, \alpha) = + y_{i, t} (1 - \alpha)^{\sum_j I[\text{rank}(s_j) < \text{rank}(s_i)] y_{j, t}} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $I[]$ is the indicator function: + + $$ + I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $$ + + References: + + - [Novelty and diversity in information retrieval evaluation, Clarke et al, + 2008][clarke2008] + - [Overview of the TREC 2009 Web Track, Clarke et al, 2009][clarke2009] + + [clarke2008]: https://dl.acm.org/doi/10.1145/1390334.1390446 + [clarke2009]: https://trec.nist.gov/pubs/trec18/papers/ENT09.OVERVIEW.pdf + """ + + def __init__( + self, + name="alpha_dcg_metric", + topn=None, + alpha=0.5, + rank_discount_fn=None, + seed=None, + dtype=None, + ragged=False, + **kwargs + ): + """Construct the ranking metric class for alpha-DCG. + + Args: + name: A string used as the name for this metric. + topn: A cutoff for how many examples to consider for this metric. + alpha: A float between 0 and 1, parameter used in definition of alpha-DCG. + Introduced as an assessor error in judging whether a document is + covering a subtopic of the query. 
+ rank_discount_fn: A function of rank discounts. Default is set to
+ `1 / log2(rank+1)`. The `rank_discount_fn` should be keras serializable.
+ Please see `log2_inverse` in `_ranking.py` as an example when defining user
+ customized functions.
+ seed: The ops-level random seed used to shuffle ties in `sort_by_scores`.
+ dtype: Data type of the metric output. See `tf.keras.metrics.Metric`.
+ ragged: A bool indicating whether the supplied tensors are ragged. If
+ True, y_true, y_pred and sample_weight (if providing per-example weights)
+ need to be ragged tensors with compatible shapes.
+ **kwargs: Other keyword arguments used in `tf.keras.metrics.Metric`.
+ """
+ super(AlphaDCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._topn = topn
+ self._alpha = alpha
+ self._rank_discount_fn = rank_discount_fn or utils.log2_inverse
+ self._seed = seed
+ self._metric = metrics_impl.AlphaDCGMetric(
+ name=name, topn=topn, alpha=alpha, rank_discount_fn=self._rank_discount_fn, seed=seed, ragged=ragged
+ )
+
+ def get_config(self):
+ config = super(AlphaDCGMetric, self).get_config()
+ config.update(
+ {
+ "topn": self._topn,
+ "alpha": self._alpha,
+ "rank_discount_fn": self._rank_discount_fn,
+ "seed": self._seed,
+ }
+ )
+ return config
diff --git a/deepray/metrics/arp.py b/deepray/metrics/arp.py
new file mode 100644
index 00000000..122f7d25
--- /dev/null
+++ b/deepray/metrics/arp.py
@@ -0,0 +1,48 @@
+from ._ranking import _RankingMetric
+from deepray.metrics import metrics_impl
+
+
+class ARPMetric(_RankingMetric):
+ r"""Average relevance position (ARP).
+
+ For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`:
+
+ ```
+ ARP(y, s) = sum_i (y_i * rank(s_i)) / sum_j y_j
+ ```
+
+ Standalone usage:
+
+ >>> y_true = [[0., 1., 1.]]
+ >>> y_pred = [[3., 1., 2.]]
+ >>> arp = tfr.keras.metrics.ARPMetric()
+ >>> arp(y_true, y_pred).numpy()
+ 2.5
+
+ >>> # Using ragged tensors
+ >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]])
+ >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]])
+ >>> arp = tfr.keras.metrics.ARPMetric(ragged=True)
+ >>> arp(y_true, y_pred).numpy()
+ 1.75
+
+ Usage with the `compile()` API:
+
+ ```python
+ model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.ARPMetric()])
+ ```
+
+ Definition:
+
+ $$
+ \text{ARP}(\{y\}, \{s\}) =
+ \frac{1}{\sum_i y_i} \sum_i y_i \cdot \text{rank}(s_i)
+ $$
+
+ where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores
+ $s$ with ties broken randomly.
+ """
+
+ def __init__(self, name=None, dtype=None, ragged=False, **kwargs):
+ super(ARPMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._metric = metrics_impl.ARPMetric(name=name, ragged=ragged)
diff --git a/deepray/metrics/cohens_kappa.py b/deepray/metrics/cohens_kappa.py
index 72ddae2d..84f1c4db 100644
--- a/deepray/metrics/cohens_kappa.py
+++ b/deepray/metrics/cohens_kappa.py
@@ -16,8 +16,7 @@
 import tensorflow as tf
 import numpy as np
-import tensorflow.keras.backend as K
-from tensorflow.keras.metrics import Metric
+import tf_keras as keras
 from deepray.utils.types import AcceptableDTypes, FloatTensorLike
 from typeguard import typechecked
@@ -25,7 +24,7 @@
 @tf.keras.utils.register_keras_serializable(package="Deepray")
-class CohenKappa(Metric):
+class CohenKappa(keras.metrics.Metric):
 """Computes Kappa score between two raters.
 The score lies in the range `[-1, 1]`.
A score of -1 represents
@@ -256,7 +255,7 @@ def reset_state(self):
 """Resets all of the metric state variables."""
 for v in self.variables:
- K.set_value(
+ keras.backend.set_value(
 v,
 np.zeros((self.num_classes, self.num_classes), v.dtype.as_numpy_dtype),
 )
diff --git a/deepray/metrics/dcg.py b/deepray/metrics/dcg.py
new file mode 100644
index 00000000..05f770ee
--- /dev/null
+++ b/deepray/metrics/dcg.py
@@ -0,0 +1,76 @@
+from ._ranking import _RankingMetric
+from deepray.metrics import metrics_impl
+
+
+class DCGMetric(_RankingMetric):
+ r"""Discounted cumulative gain (DCG).
+
+ Discounted cumulative gain ([Järvelin et al, 2002][jarvelin2002]).
+
+ For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`:
+
+ ```
+ DCG(y, s) = sum_i gain(y_i) * rank_discount(rank(s_i))
+ ```
+
+ NOTE: The `gain_fn` and `rank_discount_fn` should be keras serializable.
+ Please see `tfr.keras.utils.pow_minus_1` and `tfr.keras.utils.log2_inverse` as
+ examples when defining user customized functions.
+
+ Standalone usage:
+
+ >>> y_true = [[0., 1., 1.]]
+ >>> y_pred = [[3., 1., 2.]]
+ >>> dcg = tfr.keras.metrics.DCGMetric()
+ >>> dcg(y_true, y_pred).numpy()
+ 1.1309297
+
+ >>> # Using ragged tensors
+ >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]])
+ >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]])
+ >>> dcg = tfr.keras.metrics.DCGMetric(ragged=True)
+ >>> dcg(y_true, y_pred).numpy()
+ 2.065465
+
+ Usage with the `compile()` API:
+
+ ```python
+ model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.DCGMetric()])
+ ```
+
+ Definition:
+
+ $$
+ \text{DCG}(\{y\}, \{s\}) =
+ \sum_i \text{gain}(y_i) \cdot \text{rank_discount}(\text{rank}(s_i))
+ $$
+
+ where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores
+ $s$ with ties broken randomly.
+
+ References:
+
+ - [Cumulated gain-based evaluation of IR techniques, Järvelin et al,
+ 2002][jarvelin2002]
+
+ [jarvelin2002]: https://dl.acm.org/doi/10.1145/582415.582418
+ """
+
+ def __init__(self, name=None, topn=None, gain_fn=None, rank_discount_fn=None, dtype=None, ragged=False, **kwargs):
+ super(DCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._topn = topn
+ self._gain_fn = gain_fn or utils.pow_minus_1
+ self._rank_discount_fn = rank_discount_fn or utils.log2_inverse
+ self._metric = metrics_impl.DCGMetric(
+ name=name, topn=topn, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, ragged=ragged
+ )
+
+ def get_config(self):
+ base_config = super(DCGMetric, self).get_config()
+ config = {
+ "topn": self._topn,
+ "gain_fn": self._gain_fn,
+ "rank_discount_fn": self._rank_discount_fn,
+ }
+ config.update(base_config)
+ return config
diff --git a/deepray/metrics/f_scores.py b/deepray/metrics/f_scores.py
index db96729b..c15afc23 100755
--- a/deepray/metrics/f_scores.py
+++ b/deepray/metrics/f_scores.py
@@ -15,15 +15,15 @@
 """Implements F scores."""
 import tensorflow as tf
-from tensorflow.keras import backend as K
+import tf_keras as keras
 from typeguard import typechecked
 from deepray.utils.types import AcceptableDTypes, FloatTensorLike
 from typing import Optional
-@tf.keras.utils.register_keras_serializable(package="Deepray")
-class FBetaScore(tf.keras.metrics.Metric):
+@keras.utils.register_keras_serializable(package="Deepray")
+class FBetaScore(keras.metrics.Metric):
 r"""Computes F-Beta score.
It is the weighted harmonic mean of precision @@ -191,7 +191,7 @@ def get_config(self): def reset_state(self): reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - K.batch_set_value([(v, reset_value) for v in self.variables]) + keras.batch_set_value([(v, reset_value) for v in self.variables]) def reset_states(self): # Backwards compatibility alias of `reset_state`. New classes should diff --git a/deepray/metrics/geometric_mean.py b/deepray/metrics/geometric_mean.py index 4f5f698a..ee1081ec 100644 --- a/deepray/metrics/geometric_mean.py +++ b/deepray/metrics/geometric_mean.py @@ -15,16 +15,15 @@ """Implements GeometricMean.""" import tensorflow as tf -from tensorflow.keras import backend as K -from tensorflow.keras.metrics import Metric - +import tf_keras as keras +from tf_keras import backend as K from typeguard import typechecked from deepray.utils.types import AcceptableDTypes from deepray.metrics.utils import sample_weight_shape_match @tf.keras.utils.register_keras_serializable(package="Deepray") -class GeometricMean(Metric): +class GeometricMean(keras.metrics.Metric): """Compute Geometric Mean The geometric mean is a kind of mean. Unlike the arithmetic mean diff --git a/deepray/metrics/hits.py b/deepray/metrics/hits.py new file mode 100644 index 00000000..eabdb2c3 --- /dev/null +++ b/deepray/metrics/hits.py @@ -0,0 +1,65 @@ +from ._ranking import _RankingMetric + + +class HitsMetric(_RankingMetric): + r"""Hits@k metric. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Hits@k(y, s) = 1.0, if \exists i s.t. y_i >= 1 and rank(s_i) <= k + Hits@k(y, s) = 0.0, otherwise. + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1` and `y_i = 0` if `y_i < 1`. + NOTE: While `topn` could be left as `None` without raising an error, the Hits + metric without `topn` specified would be trivial as it simply measures the + percentage of lists with at least 1 relevant item. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> hits_at_1 = tfr.keras.metrics.HitsMetric(topn=1) + >>> hits_at_1(y_true, y_pred).numpy() + 0.0 + >>> hits_at_2 = tfr.keras.metrics.HitsMetric(topn=2) + >>> hits_at_2(y_true, y_pred).numpy() + 1.0 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 1., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> hits_at_1 = tfr.keras.metrics.HitsMetric(topn=1, ragged=True) + >>> hits_at_1(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.HitsMetric(topn=1)]) + ``` + + Definition: + + $$ + \text{Hits}@k(\{y\}, \{s\}) = \max_{i | y_i \geq 1} + \mathbf{I} [\text{rank}(s_i) \leq k] + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $y_i$ are labels. 
+ """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(HitsMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.HitsMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(HitsMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/matthews_correlation_coefficient.py b/deepray/metrics/matthews_correlation_coefficient.py index 6190144b..c871d7fe 100644 --- a/deepray/metrics/matthews_correlation_coefficient.py +++ b/deepray/metrics/matthews_correlation_coefficient.py @@ -17,7 +17,7 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend as K +import tf_keras as keras from deepray.utils.types import AcceptableDTypes, FloatTensorLike from typeguard import typechecked @@ -125,7 +125,7 @@ def reset_state(self): """Resets all of the metric state variables.""" for v in self.variables: - K.set_value( + keras.set_value( v, np.zeros((self.num_classes, self.num_classes), v.dtype.as_numpy_dtype), ) diff --git a/deepray/metrics/mean_average_precision.py b/deepray/metrics/mean_average_precision.py new file mode 100644 index 00000000..76a5ecf0 --- /dev/null +++ b/deepray/metrics/mean_average_precision.py @@ -0,0 +1,79 @@ +from ._ranking import _RankingMetric + + +class MeanAveragePrecisionMetric(_RankingMetric): + r"""Mean average precision (MAP). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + MAP(y, s) = sum_k (P@k(y, s) * rel(k)) / sum_i y_i + rel(k) = y_i if rank(s_i) = k + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> map_metric = tfr.keras.metrics.MeanAveragePrecisionMetric(topn=2) + >>> map_metric(y_true, y_pred).numpy() + 0.25 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> map_metric = tfr.keras.metrics.MeanAveragePrecisionMetric( + ... topn=2, ragged=True) + >>> map_metric(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + metrics=[tfr.keras.metrics.MeanAveragePrecisionMetric()]) + ``` + + Definition: + + $$ + \text{MAP}(\{y\}, \{s\}) = + \frac{\sum_k P@k(y, s) \cdot \text{rel}(k)}{\sum_j \bar{y}_j} \\ + \text{rel}(k) = \max_i I[\text{rank}(s_i) = k] \bar{y}_i + $$ + + where: + + * $P@k(y, s)$ is the Precision at rank $k$. See + `tfr.keras.metrics.PrecisionMetric`. 
+ * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(MeanAveragePrecisionMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.MeanAveragePrecisionMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + base_config = super(MeanAveragePrecisionMetric, self).get_config() + config = { + "topn": self._topn, + } + config.update(base_config) + return config diff --git a/deepray/metrics/metrics_impl.py b/deepray/metrics/metrics_impl.py new file mode 100644 index 00000000..dffb469d --- /dev/null +++ b/deepray/metrics/metrics_impl.py @@ -0,0 +1,895 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the metrics for TF-Ranking. + +The test cases are mainly on metrics_test.py. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import functools +import six +import tensorflow as tf + +from deepray.metrics import utils + +_DEFAULT_GAIN_FN = lambda label: tf.pow(2.0, label) - 1 + +_DEFAULT_RANK_DISCOUNT_FN = lambda rank: tf.math.log(2.) / tf.math.log1p(rank) + + +def _alpha_dcg_gain_fn(labels, alpha): + """Computes gain for alpha DCG metric from sorted labels. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. Each + value represents graded relevance to a subtopic: 1 for relevent subtopic, + 0 for irrelevant, and -1 for paddings. When the actual subtopic number of + a query is smaller than the `subtopic_size`, `labels` will be padded to + `subtopic_size` with -1, similar to the paddings used for queries with doc + number less then list_size. + alpha: A float between 0 and 1. Originally introduced as an assessor error + in judging whether a document is covering a subtopic of the query. It can + also be interpreted as the inverse number of documents covering the same + subtopic reader needs to get and confirm the subtopic information of a + query. + + Returns: + A function computes the alpha DCG gain. + """ + # Cumulative number of topics covered along the list_size dimension. + cum_subtopics = tf.cumsum(labels, axis=1, exclusive=True) + gains = tf.reduce_sum(tf.multiply(labels, tf.pow(1 - alpha, cum_subtopics)), axis=-1) + + return gains + + +def _per_example_weights_to_per_list_weights(weights, relevance): + """Computes per list weight from per example weight. + + The per-list weights are computed as: + per_list_weights = sum(weights * relevance) / sum(relevance). 
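+
+ For example, weights = [[1., 2., 3.]] with relevance = [[0., 1., 1.]] gives
+ a per-list weight of (2 + 3) / (1 + 1) = 2.5.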
+ + For a list with sum(relevance) = 0, we set a default weight as the following + average weight while all the lists with sum(weights) = 0 are ignored. + sum(per_list_weights) / num(sum(relevance) != 0 && sum(weights) != 0) + When all the lists have sum(relevance) == 0, we set the average weight to 1.0. + + Such a computation is good for the following scenarios: + - When all the weights are 1.0, the per list weights will be 1.0 everywhere, + even for lists without any relevant examples because + sum(per_list_weights) == num(sum(relevance) != 0) + This handles the standard ranking metrics where the weights are all 1.0. + - When every list has a nonzero weight, the default weight is not used. This + handles the unbiased metrics well. + - For the mixture of the above 2 scenario, the weights for lists with + nonzero relevance and nonzero weights is proportional to + per_list_weights / sum(per_list_weights) * + num(sum(relevance) != 0) / num(lists). + The rest have weights 1.0 / num(lists). + + Args: + weights: The weights `Tensor` of shape [batch_size, list_size]. + relevance: The relevance `Tensor` of shape [batch_size, list_size]. + + Returns: + The per list `Tensor` of shape [batch_size, 1] + """ + nonzero_weights = tf.greater(tf.reduce_sum(input_tensor=weights, axis=1, keepdims=True), 0.0) + per_list_relevance = tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True) + nonzero_relevance = tf.compat.v1.where( + nonzero_weights, tf.cast(tf.greater(per_list_relevance, 0.0), tf.float32), tf.zeros_like(per_list_relevance) + ) + nonzero_relevance_count = tf.reduce_sum(input_tensor=nonzero_relevance, axis=0, keepdims=True) + + per_list_weights = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=weights * relevance, axis=1, keepdims=True), per_list_relevance + ) + sum_weights = tf.reduce_sum(input_tensor=per_list_weights, axis=0, keepdims=True) + + avg_weight = tf.compat.v1.where( + tf.greater(nonzero_relevance_count, 0.0), tf.compat.v1.math.divide_no_nan(sum_weights, nonzero_relevance_count), + tf.ones_like(nonzero_relevance_count) + ) + return tf.compat.v1.where( + nonzero_weights, + tf.where(tf.greater(per_list_relevance, 0.0), per_list_weights, + tf.ones_like(per_list_weights) * avg_weight), tf.zeros_like(per_list_weights) + ) + + +def _discounted_cumulative_gain( + labels, weights=None, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN +): + """Computes discounted cumulative gain (DCG). + + DCG = SUM(gain_fn(label) / rank_discount_fn(rank)). Using the default values + of the gain and discount functions, we get the following commonly used + formula for DCG: SUM((2^label -1) / log(1+rank)). + + Args: + labels: The relevance `Tensor` of shape [batch_size, list_size]. For the + ideal ranking, the examples are sorted by relevance in reverse order. In + alpha_dcg, it is a `Tensor` with shape [batch_size, list_size, + subtopic_size]. + weights: A `Tensor` of the same shape as labels or [batch_size, 1]. The + former case is per-example and the latter case is per-list. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + + Returns: + A `Tensor` as the weighted discounted cumulative gain per-list. The + tensor shape is [batch_size, 1]. 
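+
+ For example, with the default gain and rank discount, a single list with
+ labels = [[3., 1.]] already in ranked order and unit weights gives
+ DCG = 7 / log2(2) + 1 / log2(3), approximately 7.63.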
+ """ + list_size = tf.shape(input=labels)[1] + position = tf.cast(tf.range(1, list_size + 1), dtype=tf.float32) + gain = gain_fn(tf.cast(labels, dtype=tf.float32)) + discount = rank_discount_fn(position) + return tf.reduce_sum(input_tensor=weights * gain * discount, axis=1, keepdims=True) + + +def _per_list_recall(labels, predictions, topn, mask): + """Computes the recall@k for each query in the batch. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means a + relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + topn: A cutoff for how many examples to consider for this metric. + mask: A mask indicating which entries are valid for computing the metric. + + Returns: + A `Tensor` of size [batch_size, 1] containing the recall of each query + respectively. + """ + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + topn_positives = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + labels = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + per_list_recall = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=topn_positives, axis=1, keepdims=True), + tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + ) + return per_list_recall + + +def _per_list_precision(labels, predictions, topn, mask): + """Computes the precision for each query in the batch. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means a + relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + topn: A cutoff for how many examples to consider for this metric. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A `Tensor` of size [batch_size, 1] containing the precision of each query + respectively. + """ + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + # Relevance = 1.0 when labels >= 1.0. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + if topn is None: + topn = tf.shape(relevance)[1] + valid_topn = tf.minimum(topn, tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True)) + per_list_precision = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True), tf.cast(valid_topn, dtype=tf.float32) + ) + return per_list_precision + + +class _RankingMetric(six.with_metaclass(abc.ABCMeta, object)): + """Interface for ranking metrics.""" + + def __init__(self, ragged=False): + """Constructor. + + Args: + ragged: A bool indicating whether the supplied tensors are ragged. If + True labels, predictions and weights (if providing per-example weights) + need to be ragged tensors with compatible shapes. + """ + self._ragged = ragged + + @abc.abstractproperty + def name(self): + """The metric name.""" + raise NotImplementedError('Calling an abstract method.') + + def _prepare_and_validate_params(self, labels, predictions, weights, mask): + """Prepares and validates the parameters. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means + a relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. 
+ The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + (labels, predictions, weights, mask) ready to be used for metric + calculation. + """ + if any(isinstance(tensor, tf.RaggedTensor) for tensor in [labels, predictions, weights]): + raise ValueError( + 'labels, predictions and/or weights are ragged tensors, ' + 'use ragged=True to enable ragged support for metrics.' + ) + labels = tf.convert_to_tensor(value=labels) + predictions = tf.convert_to_tensor(value=predictions) + weights = 1.0 if weights is None else tf.convert_to_tensor(value=weights) + example_weights = tf.ones_like(labels) * weights + predictions.get_shape().assert_is_compatible_with(example_weights.get_shape()) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions.get_shape().assert_has_rank(2) + + # All labels should be >= 0. Invalid entries are reset. + if mask is None: + mask = utils.is_label_valid(labels) + mask = tf.math.logical_and(mask, tf.math.greater(example_weights, 0.0)) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + predictions = tf.compat.v1.where( + mask, predictions, + -1e-6 * tf.ones_like(predictions) + tf.reduce_min(input_tensor=predictions, axis=1, keepdims=True) + ) + return labels, predictions, example_weights, mask + + def compute(self, labels, predictions, weights=None, mask=None): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` of the same shape as `predictions` representing + relevance. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: An optional `Tensor` of the same shape of predictions or + [batch_size, 1]. The former case is per-example and the latter case is + per-list. + mask: An optional `Tensor` of the same shape as predictions indicating + which entries are valid for computing the metric. Will be ignored if + the metric was constructed with ragged=True. + + Returns: + A tf metric. + """ + if self._ragged: + labels, predictions, weights, mask = utils.ragged_to_dense(labels, predictions, weights) + labels, predictions, weights, mask = self._prepare_and_validate_params(labels, predictions, weights, mask) + return self._compute_impl(labels, predictions, weights, mask) + + @abc.abstractmethod + def _compute_impl(self, labels, predictions, weights, mask): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` of the same shape as `predictions` representing + relevance. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A tf metric. + """ + raise NotImplementedError('Calling an abstract method.') + + +class _DivRankingMetric(_RankingMetric): + """Interface for diversity ranking metrics. + + Attributes: + name: A string used as the name for this metric. 
+ """ + + def __init__(self, name, topn=None, ragged=False): + super(_DivRankingMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + @abc.abstractmethod + def _compute_per_list_metric(self, labels, predictions, weights, topn, mask): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + topn: A cutoff for how many examples to consider for this metric. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A tf per-list metric. + """ + + def _prepare_and_validate_params(self, labels, predictions, weights, mask): + """Prepares and validates the parameters. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A 4-tuple of (labels, predictions, weights, mask) ready to be used + for metric calculation. + """ + labels = tf.convert_to_tensor(value=labels) + predictions = tf.convert_to_tensor(value=predictions) + labels.get_shape().assert_has_rank(3) + if mask is None: + mask = utils.is_label_valid(labels) + mask = tf.convert_to_tensor(value=mask) + if mask.get_shape().rank == 3: + mask = tf.reduce_any(mask, axis=2) + predictions = tf.where( + mask, predictions, + -1e-6 * tf.ones_like(predictions) + tf.reduce_min(input_tensor=predictions, axis=1, keepdims=True) + ) + # All labels should be >= 0. Invalid entries are reset. + labels = tf.where(tf.expand_dims(mask, axis=2), labels, tf.zeros_like(labels)) + weights = (tf.constant(1.0, dtype=tf.float32) if weights is None else tf.convert_to_tensor(value=weights)) + example_weights = tf.ones_like(predictions) * weights + + return labels, predictions, example_weights, mask + + def _compute_per_list_weights(self, weights, labels): + """Computes per list weight from weights and labels for diversification. + + Args: + weights: The weights `Tensor` of shape [batch_size, list_size]. + labels: The labels `Tensor` of shape [batch_size, list_size, + subtopic_size]. + + Returns: + The per-list `Tensor` of shape [batch_size, 1] + """ + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + return _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.reduce_any(tf.greater_equal(labels, 1.0), axis=-1), dtype=tf.float32) + ) + + def _compute_impl(self, labels, predictions, weights, mask): + """Computes the metric and per list weight with the given inputs. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. 
A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: An optional `Tensor` of the same shape as predictions indicating + which entries are valid for computing the metric. + + Returns: + A per-list metric and a per-list weights. + """ + topn = tf.shape(input=predictions)[1] if self._topn is None else self._topn + per_list_metric = self._compute_per_list_metric(labels, predictions, weights, topn, mask) + per_list_weights = self._compute_per_list_weights(weights, labels) + return per_list_metric, per_list_weights + + +class MRRMetric(_RankingMetric): + """Implements mean reciprocal rank (MRR).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(MRRMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask) + sorted_list_size = tf.shape(input=sorted_labels)[1] + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + reciprocal_rank = 1.0 / tf.cast(tf.range(1, sorted_list_size + 1), dtype=tf.float32) + # MRR has a shape of [batch_size, 1]. + mrr = tf.reduce_max(input_tensor=relevance * reciprocal_rank, axis=1, keepdims=True) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return mrr, per_list_weights + + +class HitsMetric(_RankingMetric): + r"""Implements Hits@k metric. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Hits@k(y, s) = 1.0, if \exists i s.t. y_i >= 1 and rank(s_i) <= k + Hits@k(y, s) = 0.0, otherwise. + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1` and `y_i = 0` if `y_i < 1`. + NOTE: While `topn` could be left as `None` without raising an error, the Hits + metric without `topn` specified would be trivial as it simply measures the + percentage of lists with at least 1 relevant item. + """ + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(HitsMetric, self).__init__(ragged=ragged) + self._name = name + if topn is None: + tf.compat.v1.logging.warning( + 'Hits metric without `topn` specified could be trivial. ' + 'Consider specify `topn` for Hits metric.' + ) + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask) + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + # Hits has a shape of [batch_size, 1]. 
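+ # Taking the max over the top-n binary relevance yields 1.0 if any relevant
+ # item appears within the cutoff and 0.0 otherwise.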
+ hits = tf.reduce_max(input_tensor=relevance, axis=1, keepdims=True) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return hits, per_list_weights + + +class ARPMetric(_RankingMetric): + """Implements average relevance position (ARP).""" + + def __init__(self, name, ragged=False): + """Constructor.""" + super(ARPMetric, self).__init__(ragged=ragged) + self._name = name + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + weighted_labels = sorted_labels * sorted_weights + position = (tf.cast(tf.range(1, topn + 1), dtype=tf.float32) * tf.ones_like(weighted_labels)) + per_list_weights = tf.reduce_sum(weighted_labels, axis=1, keepdims=True) + per_list_arp = tf.compat.v1.div_no_nan( + tf.reduce_sum(position * weighted_labels, axis=1, keepdims=True), per_list_weights + ) + # TODO: Consider to add a cap position topn + 1 when there is no + # relevant examples. + return per_list_arp, per_list_weights + + +class RecallMetric(_RankingMetric): + """Implements recall@k (r@k).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(RecallMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + per_list_recall = _per_list_recall(labels, predictions, topn, mask) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + per_list_weights = _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return per_list_recall, per_list_weights + + +class PrecisionMetric(_RankingMetric): + """Implements precision@k (P@k).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(PrecisionMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + per_list_precision = _per_list_precision(labels, predictions, topn, mask) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. 
+ per_list_weights = _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return per_list_precision, per_list_weights + + +class MeanAveragePrecisionMetric(_RankingMetric): + """Implements mean average precision (MAP).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(MeanAveragePrecisionMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + # Relevance = 1.0 when labels >= 1.0. + relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + sorted_relevance, sorted_weights = utils.sort_by_scores(predictions, [relevance, weights], topn=topn, mask=mask) + per_list_relevant_counts = tf.cumsum(sorted_relevance, axis=1) + per_list_cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1) + per_list_precisions = tf.math.divide_no_nan(per_list_relevant_counts, per_list_cutoffs) + total_precision = tf.reduce_sum( + input_tensor=per_list_precisions * sorted_weights * sorted_relevance, axis=1, keepdims=True + ) + + # Compute the total relevance regardless of self._topn. + total_relevance = tf.reduce_sum(input_tensor=weights * relevance, axis=1, keepdims=True) + + per_list_map = tf.math.divide_no_nan(total_precision, total_relevance) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + per_list_weights = _per_example_weights_to_per_list_weights(weights, relevance) + return per_list_map, per_list_weights + + +class NDCGMetric(_RankingMetric): + """Implements normalized discounted cumulative gain (NDCG).""" + + def __init__(self, name, topn, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN, ragged=False): + """Constructor.""" + super(NDCGMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights, self._gain_fn, self._rank_discount_fn) + # Sorting over the weighted gains to get ideal ranking. 
+ weighted_gains = weights * self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ideal_sorted_labels, ideal_sorted_weights = utils.sort_by_scores( + weighted_gains, [labels, weights], topn=topn, mask=mask + ) + ideal_dcg = _discounted_cumulative_gain( + ideal_sorted_labels, ideal_sorted_weights, self._gain_fn, self._rank_discount_fn + ) + per_list_ndcg = tf.compat.v1.math.divide_no_nan(dcg, ideal_dcg) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ) + return per_list_ndcg, per_list_weights + + +class DCGMetric(_RankingMetric): + """Implements discounted cumulative gain (DCG).""" + + def __init__(self, name, topn, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN, ragged=False): + """Constructor.""" + super(DCGMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights, self._gain_fn, self._rank_discount_fn) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ) + per_list_dcg = tf.compat.v1.math.divide_no_nan(dcg, per_list_weights) + return per_list_dcg, per_list_weights + + +class OPAMetric(_RankingMetric): + """Implements ordered pair accuracy (OPA).""" + + def __init__(self, name, ragged=False): + """Constructor.""" + super(OPAMetric, self).__init__(ragged=ragged) + self._name = name + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + valid_pair = tf.logical_and(tf.expand_dims(mask, 2), tf.expand_dims(mask, 1)) + pair_label_diff = tf.expand_dims(labels, 2) - tf.expand_dims(labels, 1) + pair_pred_diff = tf.expand_dims(predictions, 2) - tf.expand_dims(predictions, 1) + # Correct pairs are represented twice in the above pair difference tensors. + # We only take one copy for each pair. + correct_pairs = tf.cast(pair_label_diff > 0, dtype=tf.float32) * tf.cast(pair_pred_diff > 0, dtype=tf.float32) + pair_weights = tf.cast(pair_label_diff > 0, + dtype=tf.float32) * tf.expand_dims(weights, 2) * tf.cast(valid_pair, dtype=tf.float32) + per_list_weights = tf.expand_dims(tf.reduce_sum(pair_weights, axis=[1, 2]), 1) + per_list_opa = tf.compat.v1.math.divide_no_nan( + tf.expand_dims(tf.reduce_sum(correct_pairs * pair_weights, axis=[1, 2]), 1), per_list_weights + ) + return per_list_opa, per_list_weights + + +class PrecisionIAMetric(_DivRankingMetric): + """Implements Intent-Aware Precision@k (Pre-IA@k). + + PrecisionIA is a metric introduced in ["Overview of the TREC 2009 Web Track."] + by C Clarke, et al. It is one of the evaluation measures for the TREC + diversity task, where a query may have multiple different implications, termed + as subtopics / nuggets. Specifically, + Pre-IA@k = SUM_t SUM_{i=1}^k label(rank=i, topic=t) / (# of Subtopics * k), + where t indexes subtopics and i indexes document ranks, SUM_t sums over all + subtopics and SUM_{i=1}^k sums over the top k ranks. 
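+ As a small worked illustration (hypothetical numbers, not from the TREC overview):
+ with 2 subtopics and k = 2, if the two top-ranked documents carry subtopic labels
+ [1, 0] and [1, 1], then Pre-IA@2 = (1 + 1 + 0 + 1) / (2 * 2) = 0.75.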
+ """ + + def _compute_per_list_metric(self, labels, predictions, weights, topn, mask): + """See `_DivRankingMetric`.""" + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + # relevance shape = [batch_size, topn]. + relevance = tf.reduce_sum(tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32), axis=-1) + # num_subtopics shape = [batch_size, 1]. + num_subtopics = tf.reduce_sum( + tf.cast(tf.reduce_any(tf.greater_equal(labels, 1.0), axis=1, keepdims=True), dtype=tf.float32), axis=-1 + ) + if topn is None: + topn = tf.shape(relevance)[1] + # valid_topn shape = [batch_size, 1]. + valid_topn = tf.minimum(topn, tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True)) + return tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True), + tf.reduce_sum(input_tensor=tf.cast(valid_topn, dtype=tf.float32) * num_subtopics, axis=1, keepdims=True) + ) + + +class AlphaDCGMetric(_DivRankingMetric): + """Implements alpha discounted cumulative gain (alphaDCG). + + alphaDCG is a metric first introduced in ["Novelty and Diversity in + Information Retrieval Evaluation."] by C Clarke, et al. It is commonly used in + diversification tasks, where a query may have multiple different implications, + termed as subtopics / nuggets. This metric tends to emphasize a rank with + items covering different subtopics on top by a gain_fn with reduced gain from + readily covered subtopics. Specifically, + alphaDCG = SUM(gain_fn(label, alpha) / rank_discount_fn(rank)). + Using the default values of the gain and discount functions, we get the + following commonly used formula for alphaDCG: + SUM(label_i * (1-alpha)^(SUM_{rank_j R: i.e. When a lot of irrelevant documents ranked higher than the + relevant ones, the metric could be very positive. To use the latter formula, + set use_trec_version to False. + """ + + def __init__(self, name, topn, use_trec_version=True, ragged=False): + """Constructor.""" + super(BPrefMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._use_trec_version = use_trec_version + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + irrelevance = tf.cast(mask, tf.float32) - relevance + + total_relevance = tf.reduce_sum(relevance, axis=1, keepdims=True) + total_irrelevance = tf.reduce_sum(irrelevance, axis=1, keepdims=True) + + sorted_relevance, sorted_irrelevance = utils.sort_by_scores( + predictions, [relevance, irrelevance], mask=mask, topn=topn + ) + + numerator = tf.minimum(tf.cumsum(sorted_irrelevance, axis=1), total_relevance) + denominator = tf.minimum(total_irrelevance, total_relevance) if self._use_trec_version else total_relevance + + bpref = tf.math.divide_no_nan( + tf.reduce_sum(((1. - tf.math.divide_no_nan(numerator, denominator)) * sorted_relevance), axis=1, keepdims=True), + total_relevance + ) + + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(relevance, 1.0), dtype=tf.float32) + ) + + return bpref, per_list_weights + + +class PWAMetric(_RankingMetric): + """Construct a custom Position-Weighted Average Metric. 
+ + For each query we order the results by scores and compute: + + pwa = (ratings[0] * position_weights[0] + ... + + ratings[topn - 1] * position_weights[topn - 1]) / + (position_weights[0] + ... + position_weights[topn - 1]) + + where position_weights = (1. / 1, 1. / 2, ..., 1. / topn) + + Metric value for the whole dataset is weighted sum over pwa values for + individual queries: + + result = pwa(query_0) * weights[0] + pwa(query_1) * weights[1] + ... + + For this metrcs, weights should be a `Tensor` of the shape [batch_size, 1]. + """ + + def __init__(self, name, topn=5, ragged=False): + """Constructor.""" + super().__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def compute(self, labels, predictions, weights=None, mask=None): + """See `_RankingMetric`.""" + if weights is not None: + weights_tensor = tf.convert_to_tensor(value=weights) + predictions_tensor = tf.convert_to_tensor(value=predictions) + expected_shape = tf.zeros([tf.shape(predictions_tensor)[0], 1]) + if not weights_tensor.shape.is_compatible_with(expected_shape.shape): + raise ValueError('Weights should be a `Tensor` of the shape' + '[batch_size, 1]') + return super().compute(labels, predictions, weights, mask) + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_mask = utils.sort_by_scores(predictions, [labels, mask], topn=topn, mask=mask) + + sorted_list_size = tf.shape(input=sorted_labels)[1] + position_weights = 1.0 / tf.cast(tf.range(1, sorted_list_size + 1), dtype=tf.float32) + masked_position_weights = (tf.cast(sorted_mask, dtype=tf.float32) * position_weights) + pwa = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=tf.multiply(sorted_labels, masked_position_weights), axis=1, keepdims=True), + tf.reduce_sum(input_tensor=masked_position_weights, axis=1, keepdims=True) + ) + # Weights list should come in with size [batch_size, 1], then will be + # expanded out to [batch_size, list_size] in the + # "_prepare_and_validate_params" step, so we need to reduce the Tensor back + # to size [batch_size, 1]. + per_list_weights = tf.reduce_mean(input_tensor=weights, axis=1, keepdims=True) + return pwa, per_list_weights diff --git a/deepray/metrics/mrr.py b/deepray/metrics/mrr.py new file mode 100644 index 00000000..82469bab --- /dev/null +++ b/deepray/metrics/mrr.py @@ -0,0 +1,111 @@ +import tensorflow as tf + +from deepray.metrics import metrics_impl + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. + self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. 
Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +class MRRMetric(_RankingMetric): + r"""Mean reciprocal rank (MRR). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + MRR(y, s) = max_i y_i / rank(s_i) + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> mrr = dp.metrics.MRRMetric() + >>> mrr(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> mrr = dp.metrics.MRRMetric(ragged=True) + >>> mrr(y_true, y_pred).numpy() + 0.75 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.MRRMetric()]) + ``` + + Definition: + + $$ + \text{MRR}(\{y\}, \{s\}) = \max_i \frac{\bar{y}_i}{\text{rank}(s_i)} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $\bar{y_i}$ are truncated labels: + + $$ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $$ + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(MRRMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.MRRMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(MRRMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/multilabel_confusion_matrix.py b/deepray/metrics/multilabel_confusion_matrix.py index 281deaa5..1bbe5bfb 100644 --- a/deepray/metrics/multilabel_confusion_matrix.py +++ b/deepray/metrics/multilabel_confusion_matrix.py @@ -17,7 +17,7 @@ import warnings import tensorflow as tf -from tensorflow.keras import backend as K +from tf_keras import backend as K from tensorflow.keras.metrics import Metric import numpy as np diff --git a/deepray/metrics/ndcg.py b/deepray/metrics/ndcg.py new file mode 100644 index 00000000..b441df19 --- /dev/null +++ b/deepray/metrics/ndcg.py @@ -0,0 +1,131 @@ +import tensorflow as tf +from deepray.metrics import metrics_impl +from deepray.metrics import utils + +_DEFAULT_GAIN_FN = lambda label: tf.pow(2.0, label) - 1 + +_DEFAULT_RANK_DISCOUNT_FN = lambda rank: tf.math.log(2.) / tf.math.log1p(rank) + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. 
+ """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. + self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +@tf.keras.utils.register_keras_serializable(package="tensorflow_ranking") +class NDCGMetric(_RankingMetric): + r"""Normalized discounted cumulative gain (NDCG). + + Normalized discounted cumulative gain ([Järvelin et al, 2002][jarvelin2002]) + is the normalized version of `tfr.keras.metrics.DCGMetric`. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + NDCG(y, s) = DCG(y, s) / DCG(y, y) + DCG(y, s) = sum_i gain(y_i) * rank_discount(rank(s_i)) + ``` + + NOTE: The `gain_fn` and `rank_discount_fn` should be keras serializable. + Please see `tfr.keras.utils.pow_minus_1` and `tfr.keras.utils.log2_inverse` as + examples when defining user customized functions. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> ndcg = dp.metrics.NDCGMetric() + >>> ndcg(y_true, y_pred).numpy() + 0.6934264 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> ndcg = dp.metrics.NDCGMetric(ragged=True) + >>> ndcg(y_true, y_pred).numpy() + 0.7974351 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.NDCGMetric()]) + ``` + + Definition: + + $$ + \text{NDCG}(\{y\}, \{s\}) = + \frac{\text{DCG}(\{y\}, \{s\})}{\text{DCG}(\{y\}, \{y\})} \\ + \text{DCG}(\{y\}, \{s\}) = + \sum_i \text{gain}(y_i) \cdot \text{rank_discount}(\text{rank}(s_i)) + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly. 
+ + References: + + - [Cumulated gain-based evaluation of IR techniques, Järvelin et al, + 2002][jarvelin2002] + + [jarvelin2002]: https://dl.acm.org/doi/10.1145/582415.582418 + """ + + def __init__(self, name=None, topn=None, gain_fn=None, rank_discount_fn=None, dtype=None, ragged=False, **kwargs): + super(NDCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._gain_fn = gain_fn or utils.pow_minus_1 + self._rank_discount_fn = rank_discount_fn or utils.log2_inverse + self._metric = metrics_impl.NDCGMetric( + name=name, topn=topn, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, ragged=ragged + ) + + def get_config(self): + base_config = super(NDCGMetric, self).get_config() + config = { + "topn": self._topn, + "gain_fn": self._gain_fn, + "rank_discount_fn": self._rank_discount_fn, + } + config.update(base_config) + return config diff --git a/deepray/metrics/opa.py b/deepray/metrics/opa.py new file mode 100644 index 00000000..1a1beaa8 --- /dev/null +++ b/deepray/metrics/opa.py @@ -0,0 +1,55 @@ +from ._ranking import _RankingMetric + + +class OPAMetric(_RankingMetric): + r"""Ordered pair accuracy (OPA). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + OPA(y, s) = sum_i sum_j I[s_i > s_j] I[y_i > y_j] / sum_i sum_j I[y_i > y_j] + ``` + + NOTE: Pairs with equal labels (`y_i = y_j`) are always ignored. Pairs with + equal scores (`s_i = s_j`) are considered incorrectly ordered. + + Standalone usage: + + >>> y_true = [[0., 1., 2.]] + >>> y_pred = [[3., 1., 2.]] + >>> opa = tfr.keras.metrics.OPAMetric() + >>> opa(y_true, y_pred).numpy() + 0.33333334 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> opa = tfr.keras.metrics.OPAMetric(ragged=True) + >>> opa(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.OPAMetric()]) + ``` + + Definition: + + $$ + \text{OPA}(\{y\}, \{s\}) = + \frac{\sum_i \sum_j I[s_i > s_j] I[y_i > y_j]}{\sum_i \sum_j I[y_i > y_j]} + $$ + + where $I[]$ is the indicator function: + + $$ + I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $$ + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(OPAMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._metric = metrics_impl.OPAMetric(name=name, ragged=ragged) diff --git a/deepray/metrics/precision.py b/deepray/metrics/precision.py new file mode 100644 index 00000000..54184f33 --- /dev/null +++ b/deepray/metrics/precision.py @@ -0,0 +1,73 @@ +from ._ranking import _RankingMetric + + +class PrecisionMetric(_RankingMetric): + r"""Precision@k (P@k). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + P@K(y, s) = 1/k sum_i I[rank(s_i) < k] y_i + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. 
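+ For instance, with labels [0., 1., 1.] and scores [3., 1., 2.], exactly one of the two
+ highest-scoring items is relevant, so P@2 = 1/2, as the standalone usage below shows.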
+ + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> precision_at_2 = tfr.keras.metrics.PrecisionMetric(topn=2) + >>> precision_at_2(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> precision_at_2 = tfr.keras.metrics.PrecisionMetric(topn=2, ragged=True) + >>> precision_at_2(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.PrecisionMetric()]) + ``` + + Definition: + + $$ + \text{P@k}(\{y\}, \{s\}) = + \frac{1}{k} \sum_i I[\text{rank}(s_i) \leq k] \bar{y}_i + $$ + + where: + + * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + * $k = |y|$ if $k$ is not provided + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(PrecisionMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.PrecisionMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(PrecisionMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/precision_ia.py b/deepray/metrics/precision_ia.py new file mode 100644 index 00000000..1b4d17a5 --- /dev/null +++ b/deepray/metrics/precision_ia.py @@ -0,0 +1,88 @@ +from ._ranking import _RankingMetric + + +class PrecisionIAMetric(_RankingMetric): + r"""Precision-IA@k (Pre-IA@k). + + Intent-aware Precision@k ([Agrawal et al, 2009][agrawal2009]; + [Clarke et al, 2009][clarke2009]) is a precision metric that operates on + subtopics and is typically used for diversification tasks.. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Pre-IA@k(y, s) = sum_t sum_i I[rank(s_i) <= k] y_{i,t} / (# of subtopics * k) + ``` + + NOTE: The labels `y_true` should be of shape + `[batch_size, list_size, subtopic_size]`, indicating relevance for each + subtopic in the last dimension. + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_{i,t} = 1` if `y_{i,t} >= 1`. + + Standalone usage: + + >>> y_true = [[[0., 1.], [1., 0.], [1., 1.]]] + >>> y_pred = [[3., 1., 2.]] + >>> pre_ia = tfr.keras.metrics.PrecisionIAMetric() + >>> pre_ia(y_true, y_pred).numpy() + 0.6666667 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant( + ... [[[0., 0.], [1., 0.]], [[1., 1.], [0., 2.], [1., 0.]]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> pre_ia = tfr.keras.metrics.PrecisionIAMetric(ragged=True) + >>> pre_ia(y_true, y_pred).numpy() + 0.5833334 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + metrics=[tfr.keras.metrics.PrecisionIAMetric()]) + ``` + + Definition: + + $$ + \text{Pre-IA@k}(y, s) = \frac{1}{\text{# of subtopics} \cdot k} + \sum_t \sum_i I[\text{rank}(s_i) \leq k] y_{i,t} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly. 
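+ As a worked check of the first standalone example above: no cutoff is given, so k equals
+ the list size 3; each of the 2 subtopics has a label sum of 2 over the three documents,
+ giving Pre-IA = (2 + 2) / (2 * 3) ≈ 0.6667.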
+ + References: + + - [Diversifying Search Results, Agrawal et al, 2009][agrawal2009] + - [Overview of the TREC 2009 Web Track, Clarke et al, 2009][clarke2009] + + [agrawal2009]: + https://www.microsoft.com/en-us/research/publication/diversifying-search-results/ + [clarke2009]: https://trec.nist.gov/pubs/trec18/papers/ENT09.OVERVIEW.pdf + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + """Constructor. + + Args: + name: A string used as the name for this metric. + topn: A cutoff for how many examples to consider for this metric. + dtype: Data type of the metric output. See `tf.keras.metrics.Metric`. + ragged: A bool indicating whether the supplied tensors are ragged. If + True y_true, y_pred and sample_weight (if providing per-example weights) + need to be ragged tensors with compatible shapes. + **kwargs: Other keyward arguments used in `tf.keras.metrics.Metric`. + """ + super(PrecisionIAMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.PrecisionIAMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(PrecisionIAMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/r_square.py b/deepray/metrics/r_square.py index e3261de3..32585589 100644 --- a/deepray/metrics/r_square.py +++ b/deepray/metrics/r_square.py @@ -17,7 +17,7 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend as K +from tf_keras import backend as K from tensorflow.keras.metrics import Metric from tensorflow.python.ops import weights_broadcast_ops diff --git a/deepray/metrics/recall.py b/deepray/metrics/recall.py new file mode 100644 index 00000000..715ff072 --- /dev/null +++ b/deepray/metrics/recall.py @@ -0,0 +1,73 @@ +from ._ranking import _RankingMetric + + +class RecallMetric(_RankingMetric): + r"""Recall@k (R@k). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + R@K(y, s) = sum_i I[rank(s_i) < k] y_i / sum_j y_j + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. 
+ + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> recall_at_2 = tfr.keras.metrics.RecallMetric(topn=2) + >>> recall_at_2(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> recall_at_2 = tfr.keras.metrics.RecallMetric(topn=2, ragged=True) + >>> recall_at_2(y_true, y_pred).numpy() + 0.75 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.RecallMetric()]) + ``` + + Definition: + + $$ + \text{R@k}(\{y\}, \{s\}) = + \frac{\sum_i I[\text{rank}(s_i) \leq k] \bar{y}_i}{\sum_j \bar{y}_j} + $$ + + where: + + * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + * $k = |y|$ if $k$ is not provided + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(RecallMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.RecallMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(RecallMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/streaming_correlations.py b/deepray/metrics/streaming_correlations.py index fc66c19e..44354cdb 100644 --- a/deepray/metrics/streaming_correlations.py +++ b/deepray/metrics/streaming_correlations.py @@ -18,13 +18,13 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend -from tensorflow.keras.metrics import Metric +import tf_keras as keras +from tf_keras import backend from deepray.utils.types import AcceptableDTypes from typeguard import typechecked -class CorrelationBase(Metric): +class CorrelationBase(keras.metrics.Metric): """Base class for streaming correlation metrics. Based on https://arxiv.org/abs/1712.01521. diff --git a/deepray/metrics/utils.py b/deepray/metrics/utils.py index 785cc668..225b9689 100644 --- a/deepray/metrics/utils.py +++ b/deepray/metrics/utils.py @@ -14,12 +14,23 @@ # ============================================================================== """Utilities for metrics.""" +from typing import Callable +from typing import Optional + import numpy as np import tensorflow as tf +from typeguard import typechecked + from deepray.utils.types import AcceptableDTypes -from typeguard import typechecked -from typing import Optional, Callable +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. + +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] class MeanMetricWrapper(tf.keras.metrics.Mean): @@ -90,3 +101,141 @@ def sample_weight_shape_match(v, sample_weight): if np.size(sample_weight) == 1: return tf.fill(v.shape, sample_weight) return tf.convert_to_tensor(sample_weight) + + +def pow_minus_1(label: TensorLike) -> tf.Tensor: + """Computes `2**x - 1` element-wise for each label. + + Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`. 
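+ For example, labels `[0., 1., 2.]` map to gains `[0., 1., 3.]`.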
+ + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `2**x - 1`. + """ + return tf.math.pow(2., label) - 1. + + +def log2_inverse(rank: TensorLike) -> tf.Tensor: + """Computes `1./log2(1+x)` element-wise for each label. + + Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`. + """ + return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank)) + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) + + +def _get_shuffle_indices(shape, mask=None, shuffle_ties=True, seed=None): + """Gets indices which would shuffle a tensor. + + Args: + shape: The shape of the indices to generate. + mask: An optional mask that indicates which entries to place first. Its + shape should be equal to given shape. + shuffle_ties: Whether to randomly shuffle ties. + seed: The ops-level random seed. + + Returns: + An int32 `Tensor` with given `shape`. Its entries are indices that would + (randomly) shuffle the values of a `Tensor` of given `shape` along the last + axis while placing masked items first. + """ + # Generate random values when shuffling ties or all zeros when not. + if shuffle_ties: + shuffle_values = tf.random.uniform(shape, seed=seed) + else: + shuffle_values = tf.zeros(shape, dtype=tf.float32) + + # Since shuffle_values is always in [0, 1), we can safely increase entries + # where mask=False with 2.0 to make sure those are placed last during the + # argsort op. + if mask is not None: + shuffle_values = tf.where(mask, shuffle_values, shuffle_values + 2.0) + + # Generate indices by sorting the shuffle values. + return tf.argsort(shuffle_values, stable=True) + + +def sort_by_scores(scores, features_list, topn=None, shuffle_ties=True, seed=None, mask=None): + """Sorts list of features according to per-example scores. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. + features_list: A list of `Tensor`s to be sorted. The shape of the `Tensor` + can be [batch_size, list_size] or [batch_size, list_size, feature_dims]. + The latter is applicable for example features. + topn: An integer as the cutoff of examples in the sorted list. + shuffle_ties: A boolean. If True, randomly shuffle before the sorting. + seed: The ops-level random seed used when `shuffle_ties` is True. + mask: An optional `Tensor` of shape [batch_size, list_size] representing + which entries are valid for sorting. Invalid entries will be pushed to the + end. + + Returns: + A list of `Tensor`s as the list of sorted features by `scores`. + """ + with tf.compat.v1.name_scope(name='sort_by_scores'): + scores = tf.cast(scores, tf.float32) + scores.get_shape().assert_has_rank(2) + list_size = tf.shape(input=scores)[1] + if topn is None: + topn = list_size + topn = tf.minimum(topn, list_size) + + # Set invalid entries (those whose mask value is False) to the minimal value + # of scores so they will be placed last during sort ops. + if mask is not None: + scores = tf.where(mask, scores, tf.reduce_min(scores)) + + # Shuffle scores to break ties and/or push invalid entries (according to + # mask) to the end. 
+ shuffle_ind = None + if shuffle_ties or mask is not None: + shuffle_ind = _get_shuffle_indices(tf.shape(input=scores), mask, shuffle_ties=shuffle_ties, seed=seed) + scores = tf.gather(scores, shuffle_ind, batch_dims=1, axis=1) + + # Perform sort and return sorted feature_list entries. + _, indices = tf.math.top_k(scores, topn, sorted=True) + if shuffle_ind is not None: + indices = tf.gather(shuffle_ind, indices, batch_dims=1, axis=1) + return [tf.gather(f, indices, batch_dims=1, axis=1) for f in features_list] + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask diff --git a/deepray/models/BUILD b/deepray/models/BUILD index 3051b4c9..0a5b776e 100644 --- a/deepray/models/BUILD +++ b/deepray/models/BUILD @@ -9,6 +9,7 @@ py_library( "**/*.py", ]), deps = [ + "//deepray/layers", "//deepray/testing", "//deepray/utils", ], diff --git a/deepray/layers/networks/README.md b/deepray/models/README.md similarity index 96% rename from deepray/layers/networks/README.md rename to deepray/models/README.md index 87cc571e..95b632df 100644 --- a/deepray/layers/networks/README.md +++ b/deepray/models/README.md @@ -1,6 +1,6 @@ -# Networks +# Models -Networks are combinations of `tf.keras` layers (and possibly other networks). +Models are combinations of `tf.keras` layers (and possibly other models). They are `tf.keras` models that would not be trained alone. It encapsulates common network structures like a transformer encoder into an easily handled object with a standardized configuration. 
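+
+A minimal sketch of the pattern (illustrative only; the encoder class below is
+hypothetical and not part of this package): a model composes existing layers into a
+reusable object that a larger task model can embed.
+
+```python
+import tensorflow as tf
+
+
+class TinyEncoder(tf.keras.Model):
+  """Toy encoder: an embedding lookup followed by a dense projection."""
+
+  # Hypothetical example for illustration; not a deepray API.
+  def __init__(self, vocab_size=100, hidden_size=16, **kwargs):
+    super().__init__(**kwargs)
+    self.embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
+    self.projection = tf.keras.layers.Dense(hidden_size, activation="relu")
+
+  def call(self, word_ids):
+    # word_ids: int tensor of shape [batch_size, seq_length].
+    return self.projection(self.embedding(word_ids))
+```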
diff --git a/deepray/models/__init__.py b/deepray/models/__init__.py index e69de29b..2a4e09c4 100644 --- a/deepray/models/__init__.py +++ b/deepray/models/__init__.py @@ -0,0 +1,3 @@ +from deepray.models.transformer_encoder import TransformerEncoder +from deepray.models.albert_transformer_encoder import AlbertTransformerEncoder +from deepray.models.bert_span_labeler import BertSpanLabeler \ No newline at end of file diff --git a/deepray/layers/networks/albert_transformer_encoder.py b/deepray/models/albert_transformer_encoder.py similarity index 91% rename from deepray/layers/networks/albert_transformer_encoder.py rename to deepray/models/albert_transformer_encoder.py index 43ff854d..1cd7c4a2 100644 --- a/deepray/layers/networks/albert_transformer_encoder.py +++ b/deepray/models/albert_transformer_encoder.py @@ -21,10 +21,13 @@ import tensorflow as tf -from deepray import layers +from deepray.layers import dense_einsum +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class AlbertTransformerEncoder(tf.keras.Model): """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network. @@ -111,7 +114,7 @@ def __init__( mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_mask') type_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_type_ids') - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=embedding_width, initializer=initializer, @@ -121,13 +124,13 @@ def __init__( word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=initializer, use_dynamic_slicing=True, max_sequence_length=max_sequence_length, dtype=float_dtype ) position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=type_vocab_size, embedding_width=embedding_width, initializer=initializer, @@ -146,16 +149,18 @@ def __init__( # We project the 'embedding' output to 'hidden_size' if it is not already # 'hidden_size'. 
if embedding_width != hidden_size: - embeddings = layers.DenseEinsum( + embeddings = dense_einsum.DenseEinsum( output_shape=hidden_size, kernel_initializer=initializer, name='embedding_projection' )(embeddings) if float_dtype == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif float_dtype == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) data = embeddings - attention_mask = layers.SelfAttentionMask()([data, mask]) - shared_layer = layers.Transformer( + attention_mask = self_attention_mask.SelfAttentionMask()([data, mask]) + shared_layer = transformer.Transformer( num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, intermediate_activation=activation, diff --git a/deepray/layers/networks/bert_classifier.py b/deepray/models/bert_classifier.py similarity index 95% rename from deepray/layers/networks/bert_classifier.py rename to deepray/models/bert_classifier.py index 5c5e8606..faea48d4 100644 --- a/deepray/layers/networks/bert_classifier.py +++ b/deepray/models/bert_classifier.py @@ -21,10 +21,9 @@ import tensorflow as tf -from deepray.layers import networks +from deepray.models import classification -@tf.keras.utils.register_keras_serializable(package='Text') class BertClassifier(tf.keras.Model): """Classifier model based on a BERT-style transformer-based encoder. @@ -66,7 +65,7 @@ def __init__(self, network, num_classes, initializer='glorot_uniform', output='l _, cls_output = network(inputs) cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output) - self.classifier = networks.Classification( + self.classifier = classification.Classification( input_width=cls_output.shape[-1], num_classes=num_classes, initializer=initializer, diff --git a/deepray/layers/networks/bert_pretrainer.py b/deepray/models/bert_pretrainer.py similarity index 96% rename from deepray/layers/networks/bert_pretrainer.py rename to deepray/models/bert_pretrainer.py index bd3ef5ff..90858b45 100644 --- a/deepray/layers/networks/bert_pretrainer.py +++ b/deepray/models/bert_pretrainer.py @@ -20,12 +20,13 @@ from __future__ import print_function import copy + import tensorflow as tf -from deepray.layers import networks +from deepray.models import classification +from deepray.models import masked_lm -@tf.keras.utils.register_keras_serializable(package='Text') class BertPretrainer(tf.keras.Model): """BERT network training model. 
@@ -100,7 +101,7 @@ def __init__( ) inputs.append(masked_lm_positions) - self.masked_lm = networks.MaskedLM( + self.masked_lm = masked_lm.MaskedLM( num_predictions=num_token_predictions, input_width=sequence_output.shape[-1], source_network=network, @@ -112,7 +113,7 @@ def __init__( ) lm_outputs = self.masked_lm([sequence_output, masked_lm_positions]) - self.classification = networks.Classification( + self.classification = classification.Classification( input_width=cls_output.shape[-1], num_classes=num_classes, initializer=initializer, diff --git a/deepray/layers/networks/bert_span_labeler.py b/deepray/models/bert_span_labeler.py similarity index 93% rename from deepray/layers/networks/bert_span_labeler.py rename to deepray/models/bert_span_labeler.py index 046b7acc..e7a9b312 100644 --- a/deepray/layers/networks/bert_span_labeler.py +++ b/deepray/models/bert_span_labeler.py @@ -21,10 +21,9 @@ import tensorflow as tf -from deepray.layers import networks +from deepray.models import span_labeling -@tf.keras.utils.register_keras_serializable(package='Text') class BertSpanLabeler(tf.keras.Model): """Span labeler model based on a BERT-style transformer-based encoder. @@ -63,7 +62,7 @@ def __init__(self, network, initializer='glorot_uniform', output='logits', **kwa # This is an instance variable for ease of access to the underlying task # network. - self.span_labeling = networks.SpanLabeling( + self.span_labeling = span_labeling.SpanLabeling( input_width=sequence_output.shape[-1], initializer=initializer, output=output, name='span_labeling' ) start_logits, end_logits = self.span_labeling(sequence_output) @@ -74,7 +73,7 @@ def __init__(self, network, initializer='glorot_uniform', output='logits', **kwa start_logits = tf.keras.layers.Lambda(tf.identity, name='start_positions')(start_logits) end_logits = tf.keras.layers.Lambda(tf.identity, name='end_positions')(end_logits) - logits = {"start_positions": start_logits, "end_positions": end_logits} + logits = [start_logits, end_logits] super(BertSpanLabeler, self).__init__(inputs=inputs, outputs=logits, **kwargs) diff --git a/deepray/layers/networks/classification.py b/deepray/models/classification.py similarity index 97% rename from deepray/layers/networks/classification.py rename to deepray/models/classification.py index 7a53f63e..b447fa19 100644 --- a/deepray/layers/networks/classification.py +++ b/deepray/models/classification.py @@ -22,7 +22,6 @@ import tensorflow as tf -@tf.keras.utils.register_keras_serializable(package='Text') class Classification(tf.keras.Model): """Classification network head for BERT modeling. diff --git a/deepray/layers/networks/encoder_scaffold.py b/deepray/models/encoder_scaffold.py similarity index 94% rename from deepray/layers/networks/encoder_scaffold.py rename to deepray/models/encoder_scaffold.py index 4a5551ca..695b3191 100644 --- a/deepray/layers/networks/encoder_scaffold.py +++ b/deepray/models/encoder_scaffold.py @@ -20,12 +20,15 @@ from __future__ import print_function import inspect + import tensorflow as tf -from deepray import layers +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class EncoderScaffold(tf.keras.Model): """Bi-directional Transformer-based encoder network scaffold. 
@@ -95,7 +98,7 @@ def __init__( embedding_cfg=None, embedding_data=None, num_hidden_instances=1, - hidden_cls=layers.Transformer, + hidden_cls=transformer.Transformer, hidden_cfg=None, **kwargs ): @@ -125,7 +128,7 @@ def __init__( type_ids = tf.keras.layers.Input(shape=(embedding_cfg['seq_length'],), dtype=tf.int32, name='input_type_ids') inputs = [word_ids, mask, type_ids] - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=embedding_cfg['vocab_size'], embedding_width=embedding_cfg['hidden_size'], initializer=embedding_cfg['initializer'], @@ -135,7 +138,7 @@ def __init__( word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=embedding_cfg['initializer'], use_dynamic_slicing=True, max_sequence_length=embedding_cfg['max_seq_length'] @@ -143,7 +146,7 @@ def __init__( position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=embedding_cfg['type_vocab_size'], embedding_width=embedding_cfg['hidden_size'], initializer=embedding_cfg['initializer'], @@ -161,8 +164,10 @@ def __init__( if embedding_cfg.get('dtype') == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif embedding_cfg.get('dtype') == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) - attention_mask = layers.SelfAttentionMask()([embeddings, mask]) + attention_mask = self_attention_mask.SelfAttentionMask()([embeddings, mask]) data = embeddings for _ in range(num_hidden_instances): diff --git a/deepray/layers/networks/masked_lm.py b/deepray/models/masked_lm.py similarity index 98% rename from deepray/layers/networks/masked_lm.py rename to deepray/models/masked_lm.py index b2a059db..9f89fddb 100644 --- a/deepray/layers/networks/masked_lm.py +++ b/deepray/models/masked_lm.py @@ -24,7 +24,6 @@ from deepray.layers import tf_utils -@tf.keras.utils.register_keras_serializable(package='Text') class MaskedLM(tf.keras.Model): """Masked language model network head for BERT modeling. @@ -126,7 +125,6 @@ def _gather_indexes(self, sequence_tensor, positions): return output_tensor -@tf.keras.utils.register_keras_serializable(package='Text') # Temporary until we can create a Dense layer that ties the embedding. class Bias(tf.keras.layers.Layer): """Adds a bias term to an input.""" diff --git a/deepray/models/ncf_common.py b/deepray/models/ncf_common.py index 0f2c7c58..645f6596 100644 --- a/deepray/models/ncf_common.py +++ b/deepray/models/ncf_common.py @@ -33,8 +33,6 @@ from deepray.datasets.movielens import data_preprocessing from deepray.utils.flags import core as flags_core -FLAGS = flags.FLAGS - def get_inputs(params): """Returns some parameters used by the model.""" diff --git a/deepray/models/ncf_model.py b/deepray/models/ncf_model.py index 92b978f2..0abe58e1 100644 --- a/deepray/models/ncf_model.py +++ b/deepray/models/ncf_model.py @@ -87,7 +87,7 @@ def call(self, inputs, training=None, mask=None): # Custom training loop calculates loss and metric as a part of # training/evaluation step function. 
- if not self._params["keras_use_ctl"]: + if not self._params["use_custom_training_loop"]: softmax_logits = MetricLayer(self._params["match_mlperf"])([softmax_logits, dup_mask_input]) # TODO(b/134744680): Use model.add_loss() instead once the API is well # supported. diff --git a/deepray/models/ncf_test.py b/deepray/models/ncf_test.py index 4a797200..1555119c 100644 --- a/deepray/models/ncf_test.py +++ b/deepray/models/ncf_test.py @@ -65,7 +65,7 @@ def test_end_to_end_keras_dist_strat(self): @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) def test_end_to_end_keras_dist_strat_ctl(self): - flags = (self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'] + ['-keras_use_ctl', 'True']) + flags = (self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'] + ['-use_custom_training_loop', 'True']) integration.run_synthetic(ncf_keras_main.main, tmp_root=self.get_temp_dir(), extra_flags=flags) @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) @@ -87,7 +87,7 @@ def test_end_to_end_keras_1_gpu_dist_strat_ctl_fp16(self): integration.run_synthetic( ncf_keras_main.main, tmp_root=self.get_temp_dir(), - extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1', '--dtype', 'fp16', '--keras_use_ctl'] + extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1', '--dtype', 'fp16', '--use_custom_training_loop'] ) @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) diff --git a/deepray/models/rec/base_model.py b/deepray/models/rec/base_model.py index cedc5014..59095974 100644 --- a/deepray/models/rec/base_model.py +++ b/deepray/models/rec/base_model.py @@ -18,8 +18,6 @@ from deepray.utils.data.feature_map import FeatureMap from deepray.utils.data.input_meta import InputMeta -FLAGS = flags.FLAGS - # if FLAGS.use_dynamic_embedding: from tensorflow_recommenders_addons import dynamic_embedding as de diff --git a/deepray/models/rec/flen.py b/deepray/models/rec/flen.py index 2c7eff2c..ff2c245a 100644 --- a/deepray/models/rec/flen.py +++ b/deepray/models/rec/flen.py @@ -29,8 +29,6 @@ from deepray.utils.data.feature_map import FeatureMap from deepray.layers.field_wise_bi_interaction import FieldWiseBiInteraction -FLAGS = flags.FLAGS - __all__ = [ 'FLEN', ] diff --git a/deepray/models/rec/tfra_demo.py b/deepray/models/rec/tfra_demo.py deleted file mode 100644 index 076cec3e..00000000 --- a/deepray/models/rec/tfra_demo.py +++ /dev/null @@ -1,192 +0,0 @@ -import tensorflow as tf - -from tensorflow.keras.layers import (Layer, Input, Concatenate, Dense, Flatten, Lambda) -from tensorflow_recommenders_addons import dynamic_embedding as de - - -class DeepLayer(Layer): - - def __init__(self, hidden_dim, layer_num, out_dim): - self.layers = [] - self.hidden_dim = hidden_dim - self.layer_num = layer_num - self.out_dim = out_dim - for i in range(layer_num): - self.layers.append(Dense(hidden_dim, "relu")) - self.layers.append(Dense(out_dim, "sigmoid")) - super(DeepLayer, self).__init__() - - def call(self, inputs): - output = inputs - for layer in self.layers: - output = layer(output) - return output # (batch, out_dim) - - def get_config(self): - config = super().get_config() - config.update({ - "hidden_dim": self.hidden_dim, - "layer_num": self.layer_num, - "out_dim": self.out_dim, - }) - return config - - -# 构建model -def build_keras_model(is_training, mpi_size, mpi_rank): - # 初始化参数 - embedding_size = 8 - - if is_training: - initializer = tf.keras.initializers.VarianceScaling() - else: - initializer = tf.keras.initializers.Zeros() - gpu_device = ["GPU:0"] - cpu_device = ["CPU:0"] 
- - dense_embedding_layer = de.keras.layers.HvdAllToAllEmbedding( - mpi_size=mpi_size, - embedding_size=embedding_size, - key_dtype=tf.int32, - value_dtype=tf.float32, - initializer=initializer, - devices=gpu_device, - name='DenseUnifiedEmbeddingLayer', - kv_creator=de.CuckooHashTableCreator(saver=de.FileSystemSaver(proc_size=mpi_size, proc_rank=mpi_rank)) - ) - - sparse_embedding_layer = de.keras.layers.HvdAllToAllEmbedding( - mpi_size=mpi_size, - embedding_size=embedding_size, - key_dtype=tf.int64, - value_dtype=tf.float32, - initializer=initializer, - devices=cpu_device, - name='SparseUnifiedEmbeddingLayer', - kv_creator=de.CuckooHashTableCreator(saver=de.FileSystemSaver(proc_size=mpi_size, proc_rank=mpi_rank)) - ) - - # 输入层 - dense_input_dict = {"movie_genres": {'code': 1111, 'dim': 1}, "user_gender": {'code': 2222, 'dim': 1}} - sparse_input_dict = {"movie_id": {'code': 3333, 'dim': 1}, "user_id": {'code': 4444, 'dim': 1}} - - inputs = dict() - embedding_outs = [] - - # 定义 gpu embedding层 - # 主要思路是合并输入进行embedding查询,最大化利用gpu并行能力,并降低kernel launch time - # 由于 gpu dynamic embedding的动态增机制,请务必设置os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true",以保证显存不会被tensorflow graph预读。 - ################################################### - dense_input_tensors = list() - dense_input_split_dims = list() - for input_name in dense_input_dict.keys(): - dense_input_tensor = Input(shape=(1,), dtype=tf.int32, name=input_name) - inputs[input_name] = dense_input_tensor - - input_tensor_prefix_code = int(dense_input_dict[input_name]["code"]) << 17 - # dense_input_tensor = tf.bitwise.bitwise_xor(dense_input_tensor, input_tensor_prefix_code) - # xor可以用加法替代,方便后续TRT、openvino的优化 - dense_input_tensor = tf.add(dense_input_tensor, input_tensor_prefix_code) - dense_input_tensors.append(dense_input_tensor) - dense_input_split_dims.append(dense_input_dict[input_name]["dim"]) - - tmp_sum = 0 - dense_input_split_dims_final = [] - dense_input_is_sequence_feature = [] - for dim in dense_input_split_dims: - if dim == 1: - tmp_sum = tmp_sum + 1 - elif dim > 1: - if tmp_sum > 0: - dense_input_split_dims_final.append(tmp_sum) - dense_input_is_sequence_feature.append(False) - dense_input_split_dims_final.append(dim) - dense_input_is_sequence_feature.append(True) - tmp_sum = 0 - else: - raise ("dim must >= 1, which is {}".format(dim)) - if tmp_sum > 0: - dense_input_split_dims_final.append(tmp_sum) - dense_input_is_sequence_feature.append(False) - - dense_input_tensors_concat = Concatenate(axis=1)(dense_input_tensors) - dense_embedding_out_concat = dense_embedding_layer(dense_input_tensors_concat) - ################################################### - # gpu embedding部分结束 - - # 定义 cpu embedding层 - # id类特征维度空间大,显存不够用,放在主机内存 - ################################################### - sparse_input_tensors = list() - sparse_input_split_dims = list() - for input_name in sparse_input_dict.keys(): - sparse_input_tensor = Input(shape=(1,), dtype=tf.int64, name=input_name) - inputs[input_name] = sparse_input_tensor - - input_tensor_prefix_code = int(sparse_input_dict[input_name]["code"]) << 47 - # id_tensor = tf.bitwise.bitwise_xor(sparse_input_tensor, input_tensor_prefix_code) - # xor可以用加法替代,方便后续TRT、openvino的优化 - sparse_input_tensor = tf.add(sparse_input_tensor, input_tensor_prefix_code) - sparse_input_tensors.append(sparse_input_tensor) - sparse_input_split_dims.append(sparse_input_dict[input_name]["dim"]) - - tmp_sum = 0 - sparse_input_split_dims_final = [] - sparse_input_is_sequence_feature = [] - for dim in sparse_input_split_dims: - if dim == 1: 
- tmp_sum = tmp_sum + 1 - elif dim > 1: - if tmp_sum > 0: - sparse_input_split_dims_final.append(tmp_sum) - sparse_input_is_sequence_feature.append(False) - sparse_input_split_dims_final.append(dim) - sparse_input_is_sequence_feature.append(True) - tmp_sum = 0 - else: - raise ("dim must >= 1, which is {}".format(dim)) - if tmp_sum > 0: - sparse_input_split_dims_final.append(tmp_sum) - sparse_input_is_sequence_feature.append(False) - - sparse_input_tensors_concat = Concatenate(axis=1)(sparse_input_tensors) - sparse_embedding_out_concat = sparse_embedding_layer(sparse_input_tensors_concat) - ################################################### - # cpu embedding部分结束 - - # 接下来是特别处理向量特征 - # split_dims和is_sequence_feature用来辨识向量特征 - ################################################### - embedding_out = list() - embedding_out.extend( - tf.split(dense_embedding_out_concat, dense_input_split_dims_final, axis=1) - ) # (feature_combin_num, (batch, dim, emb_size)) - embedding_out.extend( - tf.split(sparse_embedding_out_concat, sparse_input_split_dims_final, axis=1) - ) # (feature_combin_num, (batch, dim, emb_size)) - assert ((len(dense_input_is_sequence_feature) + len(sparse_input_is_sequence_feature)) == len(embedding_out)) - is_sequence_feature = dense_input_is_sequence_feature + sparse_input_is_sequence_feature - for i, embedding in enumerate(embedding_out): - if is_sequence_feature[i] == True: - # 处理向量特征获得的embedding - embedding_vec = tf.math.reduce_mean( - embedding, axis=1, keepdims=True - ) # (feature_combin_num, (batch, x, emb_size)) - else: - embedding_vec = embedding - embedding_outs.append(embedding_vec) - - ################################################### - ################################################### - # embedding层 部分结束 - ################################################### - ################################################### - - # 算法后续部分 - embeddings_concat = Flatten()(Concatenate(axis=1)(embedding_outs)) - - outs = DeepLayer(256, 1, 1)(embeddings_concat) - outs = Lambda(lambda x: x, name="user_rating")(outs) - - model = tf.keras.Model(inputs=inputs, outputs=outs) - return model diff --git a/deepray/models/rec/tower_new_tfra.py b/deepray/models/rec/tower_new_tfra.py deleted file mode 100644 index 7f132302..00000000 --- a/deepray/models/rec/tower_new_tfra.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding:utf-8 -*- - -import tensorflow as tf -from absl import logging, flags -from tensorflow.keras.layers import Concatenate -from tensorflow.keras.layers import Flatten, Lambda -from tensorflow.python.framework import constant_op -from tensorflow.python.keras import backend_config -from tensorflow.python.ops import clip_ops - -from .base_model import BaseModel - -epsilon = backend_config.epsilon -FLAGS = flags.FLAGS - - -class TowerNewTFRA(BaseModel): - - def __call__( - self, - nn_hidden_units=(256, 128, 64), - nn_l2_reg=0.0, - nn_dropout=0.0, - nn_use_bn=False, - is_training=True, - *args, - **kwargs - ): - self._nn_hidden_units = nn_hidden_units - self._is_training = is_training - - self.targets = list(self.target_label_table.keys()) - self.input_dict = self.input_from_features() - features = self.build_features() - output_dict = self.build_network(features=features) - model = tf.keras.Model(inputs=self.input_dict, outputs=output_dict) - return model - - def build_network(self, flags=None, features=None): - geek_nn_dense_features, job_nn_dense_features = self.get_input_and_dense_features( - features, self._is_training, self.get_geek_nn_compo(), self.get_job_nn_compo(), self.targets, 
diff --git a/deepray/models/rec/tower_new_tfra.py b/deepray/models/rec/tower_new_tfra.py deleted file mode 100644 index 7f132302..00000000 --- a/deepray/models/rec/tower_new_tfra.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding:utf-8 -*- - -import tensorflow as tf -from absl import logging, flags -from tensorflow.keras.layers import Concatenate -from tensorflow.keras.layers import Flatten, Lambda -from tensorflow.python.framework import constant_op -from tensorflow.python.keras import backend_config -from tensorflow.python.ops import clip_ops - -from .base_model import BaseModel - -epsilon = backend_config.epsilon -FLAGS = flags.FLAGS - - -class TowerNewTFRA(BaseModel): - - def __call__( - self, - nn_hidden_units=(256, 128, 64), - nn_l2_reg=0.0, - nn_dropout=0.0, - nn_use_bn=False, - is_training=True, - *args, - **kwargs - ): - self._nn_hidden_units = nn_hidden_units - self._is_training = is_training - - self.targets = list(self.target_label_table.keys()) - self.input_dict = self.input_from_features() - features = self.build_features() - output_dict = self.build_network(features=features) - model = tf.keras.Model(inputs=self.input_dict, outputs=output_dict) - return model - - def build_network(self, flags=None, features=None): - geek_nn_dense_features, job_nn_dense_features = self.get_input_and_dense_features( - features, self._is_training, self.get_geek_nn_compo(), self.get_job_nn_compo(), self.targets, extra_dim=0 - ) - - # print("input_list:", len(input_list)) - # print("geek nn:", len(geek_nn_dense_features)) - # print("job nn:", len(job_nn_dense_features)) - - x_job = Flatten()(Concatenate(axis=-1)(job_nn_dense_features)) - x_geek = Flatten()(Concatenate(axis=-1)(geek_nn_dense_features)) - for i, n in enumerate(self._nn_hidden_units): - x_job = tf.keras.layers.Dense(n, activation='relu')(x_job) - x_geek = tf.keras.layers.Dense(n, activation='relu')(x_geek) - # if nn_dropout: - # x_job = tf.keras.layers.Dropout(nn_dropout[i])(x_job) - # x_geek = tf.keras.layers.Dropout(nn_dropout[i])(x_geek) - - x_job = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x_job) - x_geek = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x_geek) - predict_out = tf.keras.layers.Dot(axes=-1, normalize=False)([x_job, x_geek]) - - epsilon_ = constant_op.constant(epsilon(), dtype=predict_out.dtype.base_dtype) - predict_out = clip_ops.clip_by_value(predict_out, epsilon_, 1. - epsilon_) - - # output target - output_dict = dict() - output_dict['addf'] = Lambda(lambda x: x, name="addf")(predict_out) - output_dict['predict'] = Lambda(lambda x: x, name="predict")(predict_out) - output_dict['predict_0'] = Lambda(lambda x: x, name="predict_0")(predict_out) - output_dict['job_vec'] = tf.keras.layers.Lambda(lambda x: x, name='job_vec')(x_job) - output_dict['geek_vec'] = tf.keras.layers.Lambda(lambda x: x, name='geek_vec')(x_geek) - - for i, target in enumerate(self.target_label_table): - output_dict[target] = Lambda(lambda x: x, name=target)(predict_out) - - # output evaluation targets & metrics - print("conf.evaluate_target:", self.conf.evaluate_target) - - for key, config in self.conf.evaluate_target.items(): - target = config['target'] if 'target' in config else 'predict' - if target in output_dict: - pass - else: - target = 'predict' - output_dict[key] = Lambda(lambda x: x, name=key)(output_dict[target]) - - logging.info(f'output_dict: {output_dict}') - return output_dict - - # Generate NN features - def get_input_and_dense_features(self, features, is_training, geek_comp, job_comp, targets=None, extra_dim=0): - # NN features - id_features = [] - nn_features = [] - all_features = [] - geek_cnt = geek_comp - job_cnt = job_comp - emb_dim_by_name = dict() - num_targets = 1 if not targets else len(targets) - geek_feature_set = set() - for field, fea_list in self.field_dict.items(): - if field in geek_cnt: - emb_dims = geek_cnt[field] - elif field in job_cnt: - emb_dims = job_cnt[field] - else: - continue - - if len(emb_dims) < len(fea_list): - emb_dims = emb_dims + [emb_dims[-1]] * (len(fea_list) - len(emb_dims)) - - for i, fea_name in enumerate(fea_list): - if field in geek_cnt: - geek_feature_set.add(fea_name) - - feature = features[fea_name] - emb_name = feature.emb_name - emb_dim = emb_dims[i] - if self.conf.emb_reuse: - if emb_name in emb_dim_by_name and emb_dim_by_name[emb_name] != emb_dim: - logging.warn(f"[EMBED REUSE] {feature.name}@{emb_name} from {emb_dim} to {emb_dim_by_name[emb_name]}") - emb_dim = emb_dim_by_name[emb_name] - emb_dim_by_name[emb_name] = emb_dim - if feature.emb_dynamic: - id_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) - else: - nn_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) -
all_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) - - emb_dict = self.embedding_from_feature(all_features, is_training) - id_dense_features = self.dense_from_columns_id(id_features, emb_dict) - print("id:", id_dense_features) - nn_dense_features = self.dense_from_columns(nn_features, emb_dict) - nn_dense_features.update(id_dense_features) - - geek_dense_features = [] - job_dense_features = [] - i = 0 - for emb_name, feas in nn_dense_features.items(): - if emb_name in geek_feature_set: - geek_dense_features.append(feas) - else: - job_dense_features.append(feas) - - return geek_dense_features, job_dense_features diff --git a/deepray/layers/networks/span_labeling.py b/deepray/models/span_labeling.py similarity index 100% rename from deepray/layers/networks/span_labeling.py rename to deepray/models/span_labeling.py diff --git a/deepray/models/tests/__init__.py b/deepray/models/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/layers/networks/albert_transformer_encoder_test.py b/deepray/models/tests/albert_transformer_encoder_test.py similarity index 98% rename from deepray/layers/networks/albert_transformer_encoder_test.py rename to deepray/models/tests/albert_transformer_encoder_test.py index aed76c7b..3bf39ead 100644 --- a/deepray/layers/networks/albert_transformer_encoder_test.py +++ b/deepray/models/tests/albert_transformer_encoder_test.py @@ -23,7 +23,7 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.nlp.modeling.networks import albert_transformer_encoder +from deepray.models import albert_transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. 
It diff --git a/deepray/layers/networks/bert_classifier_test.py b/deepray/models/tests/bert_classifier_test.py similarity index 100% rename from deepray/layers/networks/bert_classifier_test.py rename to deepray/models/tests/bert_classifier_test.py diff --git a/deepray/layers/networks/bert_pretrainer_test.py b/deepray/models/tests/bert_pretrainer_test.py similarity index 100% rename from deepray/layers/networks/bert_pretrainer_test.py rename to deepray/models/tests/bert_pretrainer_test.py diff --git a/deepray/layers/networks/bert_span_labeler_test.py b/deepray/models/tests/bert_span_labeler_test.py similarity index 100% rename from deepray/layers/networks/bert_span_labeler_test.py rename to deepray/models/tests/bert_span_labeler_test.py diff --git a/deepray/layers/networks/classification_test.py b/deepray/models/tests/classification_test.py similarity index 100% rename from deepray/layers/networks/classification_test.py rename to deepray/models/tests/classification_test.py diff --git a/deepray/layers/networks/encoder_scaffold_test.py b/deepray/models/tests/encoder_scaffold_test.py similarity index 96% rename from deepray/layers/networks/encoder_scaffold_test.py rename to deepray/models/tests/encoder_scaffold_test.py index 9d25100d..afa7cb65 100644 --- a/deepray/layers/networks/encoder_scaffold_test.py +++ b/deepray/models/tests/encoder_scaffold_test.py @@ -22,8 +22,10 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray import layers -from deepray.layers.networks import encoder_scaffold +from official.modeling import activations +from official.nlp.modeling import layers +from official.nlp.modeling.networks import encoder_scaffold +from deepray.layers import on_device_embedding # Test class that wraps a standard transformer layer. 
If this layer is called @@ -70,7 +72,7 @@ def test_network_creation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -123,7 +125,7 @@ def test_network_creation_with_float16_dtype(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -170,7 +172,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -219,7 +221,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -252,7 +254,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -291,7 +293,7 @@ def test_network_invocation(self): word_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask") - embedding_layer = layers.OnDeviceEmbedding( + embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -303,7 +305,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -350,7 +352,7 @@ def test_serialize_deserialize(self): word_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask") - embedding_layer = layers.OnDeviceEmbedding( + embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -362,7 +364,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -441,7 +443,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 
3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -504,7 +506,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), diff --git a/deepray/layers/networks/masked_lm_test.py b/deepray/models/tests/masked_lm_test.py similarity index 98% rename from deepray/layers/networks/masked_lm_test.py rename to deepray/models/tests/masked_lm_test.py index 65a2e417..5a5f8963 100644 --- a/deepray/layers/networks/masked_lm_test.py +++ b/deepray/models/tests/masked_lm_test.py @@ -23,7 +23,8 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray.layers.networks import transformer_encoder, masked_lm +from official.nlp.modeling.networks import masked_lm +from official.nlp.modeling.networks import transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It diff --git a/deepray/layers/networks/span_labeling_test.py b/deepray/models/tests/span_labeling_test.py similarity index 100% rename from deepray/layers/networks/span_labeling_test.py rename to deepray/models/tests/span_labeling_test.py diff --git a/deepray/layers/networks/transformer_encoder_test.py b/deepray/models/tests/transformer_encoder_test.py similarity index 99% rename from deepray/layers/networks/transformer_encoder_test.py rename to deepray/models/tests/transformer_encoder_test.py index 400d27bb..70945fbd 100644 --- a/deepray/layers/networks/transformer_encoder_test.py +++ b/deepray/models/tests/transformer_encoder_test.py @@ -22,7 +22,7 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray.layers.networks import transformer_encoder +from official.nlp.modeling.networks import transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It diff --git a/deepray/layers/networks/transformer_encoder.py b/deepray/models/transformer_encoder.py similarity index 92% rename from deepray/layers/networks/transformer_encoder.py rename to deepray/models/transformer_encoder.py index 6127b07a..11ef77ef 100644 --- a/deepray/layers/networks/transformer_encoder.py +++ b/deepray/models/transformer_encoder.py @@ -21,10 +21,12 @@ import tensorflow as tf -from deepray import layers +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class TransformerEncoder(tf.keras.Model): """Bi-directional Transformer-based encoder network. 
@@ -103,19 +105,19 @@ def __init__( mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_mask') type_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_type_ids') - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=initializer, name='word_embeddings' ) word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=initializer, use_dynamic_slicing=True, max_sequence_length=max_sequence_length ) position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=type_vocab_size, embedding_width=hidden_size, initializer=initializer, @@ -133,11 +135,13 @@ def __init__( if float_dtype == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif float_dtype == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) data = embeddings - attention_mask = layers.SelfAttentionMask()([data, mask]) + attention_mask = self_attention_mask.SelfAttentionMask()([data, mask]) for i in range(num_layers): - layer = layers.Transformer( + layer = transformer.Transformer( num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, intermediate_activation=activation, diff --git a/deepray/optimizers/BUILD b/deepray/optimizers/BUILD index 12756ad8..655f307d 100644 --- a/deepray/optimizers/BUILD +++ b/deepray/optimizers/BUILD @@ -18,6 +18,19 @@ py_test( main = "tests/run_all_test.py", deps = [ ":optimizers", + "//deepray/custom_ops/embedding_variable", + "//deepray/custom_ops/training_ops", + ], +) + +py_test( + name = "adam_test", + size = "medium", + srcs = glob(["tests/adam_test.py"]), + main = "tests/adam_test.py", + deps = [ + ":optimizers", + # "//deepray/custom_ops/embedding_variable", "//deepray/custom_ops/training_ops", ], ) diff --git a/deepray/optimizers/__init__.py b/deepray/optimizers/__init__.py index 8eaff461..c3bb482b 100644 --- a/deepray/optimizers/__init__.py +++ b/deepray/optimizers/__init__.py @@ -38,7 +38,6 @@ from deepray.optimizers.proximal_adagrad import ProximalAdagrad from deepray.optimizers.rectified_adam import RectifiedAdam from deepray.optimizers.stochastic_weight_averaging import SWA -from deepray.optimizers.weight_decay_optimizers import AdamW from deepray.optimizers.adabelief import AdaBelief from deepray.optimizers.weight_decay_optimizers import SGDW from deepray.optimizers.weight_decay_optimizers import ( @@ -50,3 +49,7 @@ from deepray.optimizers.yogi import Yogi from deepray.optimizers.cocob import COCOB from deepray.optimizers.adam import Adam +from deepray.optimizers.adam_async import AdamAsync +from deepray.optimizers.gradient_descent import SGD +from deepray.optimizers.adagrad import Adagrad +from deepray.optimizers.ftrl import FtrlOptimizer \ No newline at end of file diff --git a/deepray/optimizers/adagrad.py b/deepray/optimizers/adagrad.py new file mode 100644 index 00000000..31c046a1 --- /dev/null +++ b/deepray/optimizers/adagrad.py @@ -0,0 +1,83 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adagrad for Deepray.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import tensorflow as tf +from absl import flags + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices + + +class Adagrad(tf.keras.optimizers.legacy.Adagrad): + + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={1}"]) + + def _create_slots(self, var_list): + for var in var_list: + dtype = var.dtype.base_dtype + init = tf.compat.v1.constant_initializer(self._initial_accumulator_value, dtype=dtype) + self.add_slot(var, "accumulator", init, slot_config=SlotConfig(slot_index=1, slot_num=1)) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + + acc = self.get_slot(var, "accumulator") + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts != None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adagrad_with_counts( + var.handle, + acc.handle, + coefficients["lr_t"], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adagrad( + var.handle, + acc.handle, + coefficients["lr_t"], + grad, + indices, + self.global_step, + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyAdagradV2( + var=var.handle, + accum=acc.handle, + lr=coefficients["lr_t"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + + +Adagrad.add_slot = add_slot +Adagrad._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/adam.py b/deepray/optimizers/adam.py index 0a909e83..35c2a3ff 100644 --- a/deepray/optimizers/adam.py +++ b/deepray/optimizers/adam.py @@ -19,32 +19,93 @@ from __future__ import absolute_import, division, print_function -from tensorflow.python.keras.optimizer_v2 import adam as tf_adam +import sys +from absl import flags +from tf_keras.src.optimizers.legacy import adam as adam_old + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops from deepray.custom_ops.training_ops import gen_training_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices -class Adam(tf_adam.Adam): +class Adam(adam_old.Adam): """Deepray Adam optimizer for efficient sparse updates""" - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - m = 
self.get_slot(var, 'm') - v = self.get_slot(var, 'v') + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m", slot_config=SlotConfig(slot_index=1, slot_num=2)) + for var in var_list: + self.add_slot(var, "v", slot_config=SlotConfig(slot_index=2, slot_num=2)) + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): var_device, var_dtype = var.device, var.dtype.base_dtype coefficients = ( (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) ) - return gen_training_ops.resource_sparse_apply_adam( - var=var.handle, - m=m.handle, - v=v.handle, - beta1_power=coefficients['beta_1_power'], - beta2_power=coefficients['beta_2_power'], - lr=coefficients['lr_t'], - beta1=coefficients['beta_1_t'], - beta2=coefficients['beta_2_t'], - epsilon=coefficients['epsilon'], - grad=grad, - indices=indices, - use_locking=self._use_locking - ) + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts is not None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_with_counts( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + use_locking=self._use_locking + ) + else: + return gen_training_ops.resource_sparse_apply_adam( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=coefficients['beta_1_power'], + beta2_power=coefficients['beta_2_power'], + lr=coefficients['lr_t'], + beta1=coefficients['beta_1_t'], + beta2=coefficients['beta_2_t'], + epsilon=coefficients['epsilon'], + grad=grad, + indices=indices, + use_locking=self._use_locking + ) + + +Adam.add_slot = add_slot +Adam._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/adam_async.py b/deepray/optimizers/adam_async.py new file mode 100644 index 00000000..0da3bad6 --- /dev/null +++ b/deepray/optimizers/adam_async.py @@ -0,0 +1,188 @@ +# Copyright 2025 The Deepray Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
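For the deepray/optimizers/adam.py rewrite above: gradients that arrive as IndexedSlices still get their duplicate indices summed first, and only EmbeddingVariable instances take the kv_resource_* kernels; an ordinary tf.Variable falls through to the stock sparse-apply path. A rough usage sketch, assuming a working Deepray build with its custom ops importable:

import tensorflow as tf
from deepray.optimizers import Adam  # the patched class from this diff

table = tf.Variable(tf.zeros([100, 16]))  # plain dense variable, not an EmbeddingVariable
opt = Adam(learning_rate=0.001)

with tf.GradientTape() as tape:
  rows = tf.nn.embedding_lookup(table, tf.constant([3, 7, 3]))
  loss = tf.reduce_sum(rows)

grads = tape.gradient(loss, [table])  # IndexedSlices with a repeated index (3)
opt.apply_gradients(zip(grads, [table]))  # repeats are summed, then the sparse kernel runs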
+"""AdamAsync optimizer for Deepray. +""" + +from __future__ import absolute_import, division, print_function + +import sys + +import tensorflow as tf +from absl import flags +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from deepray.custom_ops.training_ops import gen_training_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices + + +class AdamAsync(tf.keras.optimizers.legacy.Adam): + """Deepray Adam optimizer for efficient sparse updates""" + + def __init__(self, learning_rate=0.001, apply_sparse_rmsprop=False, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self._apply_sparse_rmsprop = apply_sparse_rmsprop + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m", slot_config=SlotConfig(slot_index=1, slot_num=2)) + # for var in var_list: + self.add_slot(var, "v", slot_config=SlotConfig(slot_index=2, slot_num=2)) + if isinstance(var, kv_variable_ops.EmbeddingVariable): + self.add_slot( + var, + slot_name="beta1_power", + initializer=array_ops.expand_dims(self._get_hyper("beta_1", var.dtype.base_dtype), -1), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + self.add_slot( + var, + slot_name="beta2_power", + initializer=array_ops.expand_dims(self._get_hyper("beta_2", var.dtype.base_dtype), -1), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + else: + self.add_slot( + var, + slot_name="beta1_power", + initializer=self._get_hyper("beta_1", var.dtype.base_dtype), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + self.add_slot( + var, + slot_name="beta2_power", + initializer=self._get_hyper("beta_2", var.dtype.base_dtype), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _prepare_local(self, var_device, var_dtype, apply_state): + if "learning_rate" in self._hyper: + lr_t = tf.identity(self._decayed_lr(var_dtype)) + apply_state[(var_device, var_dtype)]["lr_t"] = lr_t + + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + # beta_1_power = tf.identity(self._get_hyper("beta1_power", var_dtype)) + # beta_2_power = tf.identity(self._get_hyper("beta2_power", var_dtype)) + + # lr = apply_state[(var_device, var_dtype)]["lr_t"] * (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + apply_state[(var_device, var_dtype)].update( + dict( + # lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + # beta_1_power=beta_1_power, + one_minus_beta_1_t=1 - beta_1_t, + beta_2_t=beta_2_t, + # beta_2_power=beta_2_power, + one_minus_beta_2_t=1 - beta_2_t, + ) + ) + + def _resource_apply_dense(self, grad, var): + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + beta1_power = self.get_slot(var, 'beta1_power') + beta2_power = self.get_slot(var, 'beta2_power') + return gen_training_ops.resource_apply_adam_async( + var.handle, + m.handle, + v.handle, + 
beta1_power.handle, + beta2_power.handle, + math_ops.cast(self._lr_t, grad.dtype.base_dtype), + math_ops.cast(self._beta1_t, grad.dtype.base_dtype), + math_ops.cast(self._beta2_t, grad.dtype.base_dtype), + math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), + grad, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + beta1_power = self.get_slot(var, 'beta1_power') + beta2_power = self.get_slot(var, 'beta2_power') + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = ( + (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + ) + + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts is not None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_async_with_counts( + var.handle, + m.handle, + v.handle, + beta1_power.handle, + beta2_power.handle, + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_async( + var.handle, + m.handle, + v.handle, + beta1_power.handle, + beta2_power.handle, + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + else: + return gen_training_ops.resource_sparse_apply_adam_async( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=beta1_power.handle, + beta2_power=beta2_power.handle, + lr=coefficients['lr_t'], + beta1=coefficients['beta_1_t'], + beta2=coefficients['beta_2_t'], + epsilon=coefficients['epsilon'], + grad=grad, + indices=indices, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + + +AdamAsync.add_slot = add_slot +AdamAsync._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/ev_optimizer_patch.py b/deepray/optimizers/ev_optimizer_patch.py new file mode 100644 index 00000000..ba6e391f --- /dev/null +++ b/deepray/optimizers/ev_optimizer_patch.py @@ -0,0 +1,260 @@ +# Copyright 2024 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""EmbeddingVariable optimizer.""" + +import tensorflow as tf +from packaging.version import parse +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.custom_ops.unique_ops import gen_array_ops + +if parse(tf.__version__) < parse("2.11.0"): + from keras.optimizers.legacy.optimizer_v2 import _var_key +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.optimizers.legacy.optimizer_v2 import _var_key + from tf_keras.src import backend + from tf_keras.src.optimizers.legacy.optimizer_v2 import _deduplicate_indexed_slices +else: + from keras.src.optimizers.legacy.optimizer_v2 import _var_key + from keras.src import backend + from keras.src.optimizers.legacy.optimizer_v2 import _deduplicate_indexed_slices + +import tf_keras as keras +import functools + +from deepray.custom_ops.embedding_variable.python import kv_variable_ops + +from tensorflow.core.framework import attr_value_pb2 +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable_internal, get_embedding_variable_v2_internal + + +class SlotConfig: + + def __init__(self, slot_num=1, slot_index=0, slot_type=config_pb2.SlotType.EMBEDDING_VARIABLE): + self.slot_num = slot_num + self.slot_index = slot_index + self.slot_type = slot_type + + +def _set_init_op_embedding_type_attr(var, embedding_type): + var._init_op._set_attr("embedding_variable_type", attr_value_pb2.AttrValue(i=embedding_type)) + var._initializer_for_restore._set_attr("embedding_variable_type", attr_value_pb2.AttrValue(i=embedding_type)) + + +def _set_init_op_slot_num_attr(var, slot_num): + var._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + var._initializer_for_restore._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + + +def add_slot(self, var, slot_name, initializer="zeros", shape=None, slot_config=None): + """Add a new slot variable for `var`. + + A slot variable is an additional variable associated with `var` to + train. It is allocated and managed by optimizers, e.g. `Adam`. + + Args: + var: a `Variable` object. + slot_name: name of the slot variable. + initializer: initializer of the slot variable + shape: (Optional) shape of the slot variable. If not set, it will + default to the shape of `var`. + + Returns: + A slot variable. 
+ """ + if slot_name not in self._slot_names: + self._slot_names.append(slot_name) + var_key = _var_key(var) + slot_dict = self._slots.setdefault(var_key, {}) + weight = slot_dict.get(slot_name, None) + if weight is None: + if isinstance(initializer, str) or callable(initializer): + initializer = keras.initializers.get(initializer) + if isinstance( + initializer, + tf.__internal__.tracking.CheckpointInitialValueCallable, + ) or (shape is not None): + slot_shape = shape + else: + slot_shape = var.shape + initial_value = functools.partial(initializer, shape=slot_shape, dtype=var.dtype) + else: + initial_value = initializer + + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if slot_config is None: + weight = get_embedding_variable_internal( + name=f"{var._shared_name}/{slot_name}", + initializer=initializer, + trainable=False, + embedding_dim=slot_shape, + key_dtype=var._invalid_key_type, + value_dtype=var.dtype, + validate_shape=slot_shape.is_fully_defined(), + steps_to_live=var._steps_to_live, + ht_partition_num=var._ht_partition_num + ) + # _set_init_op_embedding_type_attr(weight, config_pb2.EmbeddingVariableType.MUTABLE) + else: + filter_strategy = None + if var._filter_freq != 0: + if var._max_element_size != 0: + filter_strategy = ev_variables.CBFFilter( + filter_freq=var._filter_freq, + max_element_size=var._max_element_size, + false_positive_probability=var._false_positive_probability, + counter_type=var._counter_type + ) + else: + filter_strategy = ev_variables.CounterFilter(filter_freq=var._filter_freq) + if slot_config.slot_type is config_pb2.SlotType.EMBEDDING_VARIABLE: + # _set_init_op_slot_num_attr(var, slot_config.slot_num) + var._slot_num = slot_config.slot_num + emb_index = var._emb_index + if var.block_num > 1: + var = var._primary + weight = get_embedding_variable_v2_internal( + name=f"{var._shared_name}/{slot_name}", + initializer=initializer, + trainable=False, + embedding_dim=slot_shape, + key_dtype=var._invalid_key_type, + value_dtype=var.dtype, + validate_shape=slot_shape.is_fully_defined(), + evconfig=ev_variables.EmbeddingVariableConfig( + steps_to_live=var._steps_to_live, + handle_name=var._block_handle_name, + emb_index=emb_index, + block_num=var.block_num, + slot_index=slot_config.slot_index, + primary=var._primary, + slot_num=slot_config.slot_num, + storage_type=var.storage_type, + storage_path=var._storage_path, + storage_size=var._storage_size, + storage_cache_strategy=var._storage_cache_strategy, + layout=var._layout, + l2_weight_threshold=var._l2_weight_threshold, + filter_strategy=filter_strategy + ) + ) + else: + weight = tf.Variable( + name=f"{var._shared_name}/{slot_name}", + dtype=var.dtype, + trainable=False, + initial_value=initial_value, + ) + else: + with self._distribution_strategy_scope(): + strategy = tf.distribute.get_strategy() + if not strategy.extended.variable_created_in_scope(var): + raise ValueError( + "Trying to create optimizer slot variable under the " + "scope for tf.distribute.Strategy ({}), which is " + "different from the scope used for the original " + "variable ({}). Make sure the slot variables are " + "created under the same strategy scope. 
This may " + "happen if you're restoring from a checkpoint " + "outside the scope.".format(strategy, var) + ) + + with strategy.extended.colocate_vars_with(var): + weight = tf.Variable( + name=f"{var._shared_name}/{slot_name}", + dtype=var.dtype, + trainable=False, + initial_value=initial_value, + ) + + backend.track_variable(weight) + slot_dict[slot_name] = weight + self._restore_slot_variable(slot_name=slot_name, variable=var, slot_variable=weight) + self._weights.append(weight) + return weight + + +def _deduplicate_indexed_slices_with_counts(values, indices): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops.deepray_unique_with_counts(indices, out_idx=dtypes.int64) + summed_values = math_ops.unsorted_segment_sum(values, new_index_positions, array_ops.shape(unique_indices)[0]) + return summed_values, unique_indices, indices_counts + + +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, extra_counts, extra_indices): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions, summed_counts = \ + gen_array_ops.deepray_unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_values = math_ops.unsorted_segment_sum(values, new_index_positions, array_ops.shape(unique_indices)[0]) + return summed_values, unique_indices, summed_counts + + +def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices, **kwargs): + """Add ops to apply sparse gradients to `handle`, with repeated indices. + + Optimizers which override this method must deal with repeated indices. See + the docstring of `_apply_sparse_duplicate_indices` for details. By default + the correct behavior, to sum non-unique indices and their associated + gradients, is enforced by first pre-processing `grad` and `indices` and + passing them on to `_resource_apply_sparse`. Optimizers which deal correctly + with duplicate indices may instead override this method to avoid the + overhead of summing. + + Args: + grad: a `Tensor` representing the gradient for the affected indices. + handle: a `Tensor` of dtype `resource` which points to the variable + to be updated. + indices: a `Tensor` of integral type representing the indices for + which the gradient is nonzero. Indices may be repeated. + + Returns: + An `Operation` which updates the value of the variable. 
+ """ + from deepray.custom_ops.embedding_variable import kv_variable_ops + if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts(): + if len(handle._counts_tensor.keys()) == 0: + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts( + values=grad, indices=indices) + else: + extra_counts, extra_indices = [], [] + if indices.op.type == "ConcatV2": + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts_reduction( + grad, indices, extra_counts, extra_indices) + return self._resource_apply_sparse( + grad=summed_grad, var=handle, indices=unique_indices, indices_counts=indices_counts, **kwargs + ) + else: + summed_grad, unique_indices = _deduplicate_indexed_slices(values=grad, indices=indices) + return self._resource_apply_sparse(summed_grad, handle, unique_indices, **kwargs) diff --git a/deepray/optimizers/ftrl.py b/deepray/optimizers/ftrl.py new file mode 100644 index 00000000..33c2c2f7 --- /dev/null +++ b/deepray/optimizers/ftrl.py @@ -0,0 +1,96 @@ +import sys + +import tensorflow as tf +from absl import flags + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from .ev_optimizer_patch import add_slot, SlotConfig + + +class FtrlOptimizer(tf.keras.optimizers.legacy.Ftrl): + + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create the "accum" and "linear" slots. + for var in var_list: + dtype = var.dtype.base_dtype + init = tf.compat.v1.constant_initializer(self._initial_accumulator_value, dtype=dtype) + self.add_slot(var, "accumulator", init, slot_config=SlotConfig(slot_index=1, slot_num=2)) + self.add_slot(var, "linear", slot_config=SlotConfig(slot_index=2, slot_num=2)) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + + # Adjust L2 regularization strength to include beta to avoid the + # underlying TensorFlow ops needing to include it. 
+ adjusted_l2_regularization_strength = coefficients["l2_regularization_strength" + ] + coefficients["beta"] / (2.0 * coefficients["lr_t"]) + + accum = self.get_slot(var, "accumulator") + linear = self.get_slot(var, "linear") + + if self._l2_shrinkage_regularization_strength <= 0.0: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + return gen_kv_variable_ops.kv_resource_sparse_apply_ftrl( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + coefficients["lr_t"], + coefficients["l1_regularization_strength"], + adjusted_l2_regularization_strength, + coefficients["learning_rate_power"], + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyFtrl( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + else: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + return gen_kv_variable_ops.kv_resource_sparse_apply_ftrl_v2( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + coefficients["lr_t"], + coefficients["l1_regularization_strength"], + adjusted_l2_regularization_strength, + coefficients["l2_shrinkage_regularization_strength"], + coefficients["learning_rate_power"], + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyFtrlV2( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + l2_shrinkage=coefficients["l2_shrinkage_regularization_strength"], + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + + +FtrlOptimizer.add_slot = add_slot diff --git a/deepray/optimizers/gradient_descent.py b/deepray/optimizers/gradient_descent.py new file mode 100644 index 00000000..1ff6f25c --- /dev/null +++ b/deepray/optimizers/gradient_descent.py @@ -0,0 +1,91 @@ +# Copyright 2024 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
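The dedup helpers in ev_optimizer_patch.py above reduce repeated lookup ids to unique ids, summed gradient rows, and per-id occurrence counts, which the *_with_counts kernels use for frequency-aware updates. The same computation sketched with stock TF ops instead of the custom deepray_unique_with_counts kernel (illustrative values only):

import tensorflow as tf

indices = tf.constant([3, 7, 3, 9, 7, 3], dtype=tf.int64)  # lookup ids, with repeats
values = tf.ones([6, 4])  # one gradient row per lookup

unique_ids, positions, counts = tf.unique_with_counts(indices)
summed = tf.math.unsorted_segment_sum(values, positions, tf.shape(unique_ids)[0])
# unique_ids == [3, 7, 9], counts == [3, 2, 1]; each row of summed is the gradient
# sum for one unique id, which is what _resource_apply_sparse ultimately receives.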
+# ============================================================================== +"""GradientDescentOptimizer for Deepray.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tf_keras.src.optimizers.legacy import gradient_descent as gd_old + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops + + +class SGD(gd_old.SGD): + + def __init__(self, learning_rate=0.01, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + + def _resource_apply_sparse_duplicate_indices(self, grad, var, indices, **kwargs): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = kwargs.get("apply_state", {}).get((var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + if self._momentum: + # This method is only needed for momentum optimization. + momentum_var = self.get_slot(var, "momentum") + return tf.raw_ops.ResourceSparseApplyKerasMomentum( + var=var.handle, + accum=momentum_var.handle, + lr=coefficients["lr_t"], + grad=grad, + indices=indices, + momentum=coefficients["momentum"], + use_locking=self._use_locking, + use_nesterov=self.nesterov, + ) + else: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if var.need_counts() and len(var._counts_tensor.keys()) != 0: + extra_counts, extra_indices = [], [] + if indices.op.type == "ConcatV2": + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + if indices_tensor in var._counts_tensor: + extra_counts.append(var._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + if indices_tensor in var._counts_tensor: + extra_counts.append(var._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + + from deepray.custom_ops.unique_ops import gen_array_ops + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops.deepray_unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_grads = math_ops.unsorted_segment_sum(grad, new_index_positions, array_ops.shape(unique_indices)[0]) + return gen_kv_variable_ops.kv_resource_sparse_apply_gradient_descent_with_counts( + var.handle, + coefficients["lr_t"], + summed_grads, + unique_indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_gradient_descent( + var.handle, coefficients["lr_t"], grad, indices, self.global_step, use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceScatterAdd( + resource=var.handle, + indices=indices, + updates=-grad * coefficients["lr_t"], + ) diff --git a/deepray/optimizers/lazy_adam.py b/deepray/optimizers/lazy_adam.py index 6fda8c3d..2c940f32 100644 --- a/deepray/optimizers/lazy_adam.py +++ b/deepray/optimizers/lazy_adam.py @@ -23,17 +23,18 @@ import importlib import tensorflow as tf from deepray.utils.types import FloatTensorLike +import tf_keras as keras from typeguard import typechecked from typing import Union, Callable -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: - adam_optimizer_class = tf.keras.optimizers.legacy.Adam +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: + adam_optimizer_class 
= keras.optimizers.legacy.Adam else: - adam_optimizer_class = tf.keras.optimizers.Adam + adam_optimizer_class = keras.optimizers.Adam -@tf.keras.utils.register_keras_serializable(package="Deepray") +@keras.utils.register_keras_serializable(package="Deepray") class LazyAdam(adam_optimizer_class): """Variant of the Adam optimizer that handles sparse updates more efficiently. @@ -67,7 +68,7 @@ def __init__( Args: learning_rate: A `Tensor` or a floating point value. or a schedule - that is a `tf.keras.optimizers.schedules.LearningRateSchedule` + that is a `keras.optimizers.schedules.LearningRateSchedule` The learning rate. beta_1: A `float` value or a constant `float` tensor. The exponential decay rate for the 1st moment estimates. @@ -142,3 +143,6 @@ def _resource_scatter_operate(self, resource, indices, update, resource_scatter_ } return resource_scatter_op(**resource_update_kwargs) + + def get_config(self): + return super().get_config() diff --git a/deepray/optimizers/multi_optimizer.py b/deepray/optimizers/multi_optimizer.py index 273710d4..aae91dd3 100644 --- a/deepray/optimizers/multi_optimizer.py +++ b/deepray/optimizers/multi_optimizer.py @@ -12,20 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Discriminative Layer Training Optimizer for TensorFlow.""" +"""Multiple Optimizer for TensorFlow. +References: +1. https://github.com/tensorflow/recommenders/blob/7caed557b9d5194202d8323f2d4795231a5d0b1d/tensorflow_recommenders/experimental/optimizers/composite_optimizer.py#L25 +2. https://github.com/tensorflow/addons/blob/d208d752e98c310280938efa939117bf635a60a8/tensorflow_addons/optimizers/discriminative_layer_training.py#L47 +3. https://github.com/NVIDIA-Merlin/models/blob/eb1e54196a64a70950b2a7e7744d2150e052d53e/merlin/models/tf/blocks/optimizer.py#L73 +""" from collections import defaultdict from typing import List, Union import tensorflow as tf -from tensorflow.keras.optimizers import Optimizer as keras_optimizer -from tensorflow.python.keras.optimizer_v2 import optimizer_v2 -from tensorflow.python.training import optimizer +from packaging.version import Version from typeguard import typechecked -import deepray as dp from deepray.optimizers import KerasLegacyOptimizer +if Version(tf.__version__).release >= Version("2.16").release: + # Determine if loading keras 2 or 3. + if (hasattr(tf.keras, "version") and Version(tf.keras.version()).release >= Version("3.0").release): + # New versions of Keras require importing from `keras.src` when + # importing internal symbols. + from keras.src import backend + from keras.src.utils import tf_utils + else: + from tf_keras.src import backend + from tf_keras.src.utils import tf_utils +elif Version(tf.__version__).release >= Version("2.13").release: + from keras.src import backend + from keras.src.utils import tf_utils +else: + from keras import backend + from keras.utils import tf_utils + class MultiOptimizer(KerasLegacyOptimizer): """Multi Optimizer Wrapper for Discriminative Layer Training. 
@@ -86,28 +105,19 @@ def __init__( name: str = "MultiOptimizer", **kwargs, ): - super(MultiOptimizer, self).__init__(name, **kwargs) if default_optimizer is None: - raise RuntimeError("Must specify `default_optimizer`.") + raise RuntimeError("Must specify a `default_optimizer`.") self.optimizers_and_varnames = optimizers_and_varnames self.default_optimizer = default_optimizer - if isinstance(self, optimizer.Optimizer): - self.compute_gradients = self.default_optimizer.compute_gradients - elif isinstance(self, optimizer_v2.OptimizerV2) or isinstance(self, keras_optimizer): - self.compute_gradients = self.default_optimizer._compute_gradients - else: - raise Exception("Optimizer type is not supported! got {}".format(str(type(self)))) + def apply_gradients(self, grads_and_vars, **kwargs): + """Wrapped apply_gradient method. - def minimize(self, loss, var_list, tape): - # Compute gradients - grads_and_vars = self.compute_gradients(loss=loss, var_list=var_list, tape=tape) - self.apply_gradients(grads_and_vars) - - def apply_gradients(self, grads_and_vars, name=None, **kwargs): + Returns an operation to be executed. + """ # Create a dictionary with a default optimizer and an empty variable list - var_dict, grad_dict = defaultdict(list), defaultdict(list) + grad_var_dict = defaultdict(list) # Iterate over the trainable variables list for grad, var in grads_and_vars: @@ -115,37 +125,33 @@ def apply_gradients(self, grads_and_vars, name=None, **kwargs): for optimizer, varnames in self.optimizers_and_varnames: if any(name in var.name for name in varnames.split(',')): # If it does, append the variable to the optimizer's variable list - var_dict[optimizer].append(var) - grad_dict[optimizer].append(grad) + grad_var_dict[optimizer].append((grad, var)) break else: # If it doesn't, append the variable to the default optimizer's variable list - var_dict[self.default_optimizer].append(var) - grad_dict[self.default_optimizer].append(grad) + grad_var_dict[self.default_optimizer].append((grad, var)) + update_ops = [] # Call the apply_gradients method for each optimizer with the corresponding gradient and variable list - for optimizer, partvar_list in var_dict.items(): - optimizer.apply_gradients(zip(grad_dict[optimizer], partvar_list)) + for optimizer, grad_var in grad_var_dict.items(): + update_ops.append(optimizer.apply_gradients(grad_var, **kwargs)) - def get_config(self): - # https://github.com/tensorflow/addons/blob/062a7aaf33e4618fc3eb55f54915278287bb545f/tensorflow_addons/optimizers/discriminative_layer_training.py#L153 - raise NotImplementedError("MultiOptimizer cannot be serialized because" - " it uses callable to get variables.") + # update_ops = [optimizer.apply_gradients(grad_var, **kwargs) for optimizer, grad_var in grad_var_dict.items()] + update_group = tf.group(update_ops) - @property - def iterations(self): - """The number of training steps this `optimizer` has run. + any_symbolic = any(isinstance(i, tf.Operation) or tf_utils.is_symbolic_tensor(i) for i in update_ops) - By default, iterations would be incremented by one every time - `apply_gradients()` is called. - """ - return self.default_optimizer.iterations + if not tf.executing_eagerly() or any_symbolic: + # If the current context is graph mode or any of the update ops are + # symbolic then the step update should be carried out under a graph + # context. 
(eager updates execute immediately) + with backend._current_graph( # pylint: disable=protected-access + update_ops + ).as_default(): + with tf.control_dependencies([update_group]): + return self.iterations.assign_add(1, read_value=False) - @iterations.setter - def iterations(self, variable): - """See base class.""" - for optimizer, _ in self.optimizers_and_varnames: - optimizer.iterations = variable + return self.iterations.assign_add(1) def variables(self): """Returns the optimizer's variables.""" diff --git a/deepray/optimizers/optimization.py b/deepray/optimizers/optimization.py index 26f3e5ff..2ca36f76 100644 --- a/deepray/optimizers/optimization.py +++ b/deepray/optimizers/optimization.py @@ -26,8 +26,6 @@ from .warmup import WarmUpPolynomial -FLAGS = flags.FLAGS - def create_optimizer(init_lr, num_train_steps, num_warmup_steps, optimizer_type="adam"): """Creates an optimizer with learning rate schedule.""" @@ -81,9 +79,6 @@ def create_optimizer(init_lr, num_train_steps, num_warmup_steps, optimizer_type= # if FLAGS.use_horovod: # import horovod.tensorflow.keras as hvd # optimizer = hvd.DistributedOptimizer(optimizer, backward_passes_per_step=1, average_aggregated_gradients=True) - if FLAGS.use_dynamic_embedding: - from tensorflow_recommenders_addons import dynamic_embedding as de - optimizer = de.DynamicEmbeddingOptimizer(optimizer, synchronous=FLAGS.use_horovod) return optimizer diff --git a/deepray/optimizers/tests/weight_decay_optimizers_test.py b/deepray/optimizers/tests/weight_decay_optimizers_test.py index 27d777c8..c2a1041f 100644 --- a/deepray/optimizers/tests/weight_decay_optimizers_test.py +++ b/deepray/optimizers/tests/weight_decay_optimizers_test.py @@ -373,7 +373,7 @@ def test_var_list_with_exclude_list_sgdw(dtype): ) -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: optimizer_class = tf.keras.optimizers.legacy.SGD else: optimizer_class = tf.keras.optimizers.SGD diff --git a/deepray/optimizers/weight_decay_optimizers.py b/deepray/optimizers/weight_decay_optimizers.py index 53624611..264bab37 100644 --- a/deepray/optimizers/weight_decay_optimizers.py +++ b/deepray/optimizers/weight_decay_optimizers.py @@ -256,7 +256,7 @@ def _do_use_weight_decay(self, var): return var.ref() in self._decay_var_list -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: keras_legacy_optimizer = Union[tf.keras.optimizers.legacy.Optimizer, tf.keras.optimizers.Optimizer] else: keras_legacy_optimizer = tf.keras.optimizers.Optimizer diff --git a/deepray/repo.bzl b/deepray/repo.bzl new file mode 100644 index 00000000..8ce8d04d --- /dev/null +++ b/deepray/repo.bzl @@ -0,0 +1,48 @@ +""" TensorFlow Http Archive + +Modified http_archive that allows us to override the TensorFlow commit that is +downloaded by setting an environment variable. This override is to be used for +testing purposes. + +Add the following to your Bazel build command in order to override the +TensorFlow revision. 
+ +build: --action_env TF_REVISION="" + + * `TF_REVISION`: tensorflow revision override (git commit hash) +""" + +_TF_REVISION = "TF_REVISION" + +def _tensorflow_http_archive(ctx): + git_commit = ctx.attr.git_commit + sha256 = ctx.attr.sha256 + patch = getattr(ctx.attr, "patch", None) + + override_git_commit = ctx.os.environ.get(_TF_REVISION) + if override_git_commit: + sha256 = "" + git_commit = override_git_commit + + strip_prefix = "tensorflow-%s" % git_commit + urls = [ + "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % git_commit, + ] + ctx.download_and_extract( + urls, + "", + sha256, + "", + strip_prefix, + ) + if patch: + ctx.patch(patch, strip = 1) + +tensorflow_http_archive = repository_rule( + implementation = _tensorflow_http_archive, + attrs = { + "git_commit": attr.string(mandatory = True), + "sha256": attr.string(mandatory = True), + "patch": attr.label(), + }, +) diff --git a/deepray/seq2seq/BUILD b/deepray/seq2seq/BUILD deleted file mode 100644 index 325f9fac..00000000 --- a/deepray/seq2seq/BUILD +++ /dev/null @@ -1,26 +0,0 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) - -py_library( - name = "seq2seq", - srcs = glob(["*.py"]), - data = [ - "//deepray:options.py", - "//deepray/custom_ops/seq2seq:_beam_search_ops.so", - ], - deps = [ - "//deepray/testing", - "//deepray/utils", - ], -) - -py_test( - name = "seq2seq_test", - size = "medium", - srcs = glob(["tests/*"]), - main = "tests/run_all_test.py", - deps = [ - ":seq2seq", - ], -) diff --git a/deepray/seq2seq/__init__.py b/deepray/seq2seq/__init__.py deleted file mode 100644 index 7e5124b5..00000000 --- a/deepray/seq2seq/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Additional layers for sequence to sequence models.""" - -from deepray.seq2seq.attention_wrapper import AttentionMechanism -from deepray.seq2seq.attention_wrapper import AttentionWrapper -from deepray.seq2seq.attention_wrapper import AttentionWrapperState -from deepray.seq2seq.attention_wrapper import BahdanauAttention -from deepray.seq2seq.attention_wrapper import BahdanauMonotonicAttention -from deepray.seq2seq.attention_wrapper import LuongAttention -from deepray.seq2seq.attention_wrapper import LuongMonotonicAttention -from deepray.seq2seq.attention_wrapper import hardmax -from deepray.seq2seq.attention_wrapper import monotonic_attention -from deepray.seq2seq.attention_wrapper import safe_cumprod - -from deepray.seq2seq.basic_decoder import BasicDecoder -from deepray.seq2seq.basic_decoder import BasicDecoderOutput - -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoder -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoderOutput -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoderState -from deepray.seq2seq.beam_search_decoder import FinalBeamSearchDecoderOutput -from deepray.seq2seq.beam_search_decoder import gather_tree -from deepray.seq2seq.beam_search_decoder import gather_tree_from_array -from deepray.seq2seq.beam_search_decoder import tile_batch - -from deepray.seq2seq.decoder import BaseDecoder -from deepray.seq2seq.decoder import Decoder -from deepray.seq2seq.decoder import dynamic_decode - -from deepray.seq2seq.loss import SequenceLoss -from deepray.seq2seq.loss import sequence_loss - -from deepray.seq2seq.sampler import CustomSampler -from deepray.seq2seq.sampler import GreedyEmbeddingSampler -from deepray.seq2seq.sampler import InferenceSampler -from deepray.seq2seq.sampler import SampleEmbeddingSampler -from deepray.seq2seq.sampler import Sampler -from deepray.seq2seq.sampler import ScheduledEmbeddingTrainingSampler -from deepray.seq2seq.sampler import ScheduledOutputTrainingSampler -from deepray.seq2seq.sampler import TrainingSampler diff --git a/deepray/tensorflow.bzl b/deepray/tensorflow.bzl deleted file mode 100644 index 082a5518..00000000 --- a/deepray/tensorflow.bzl +++ /dev/null @@ -1,333 +0,0 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") - -cc_shared_library = native.cc_shared_library - -def if_google(google_value, oss_value = []): - """Returns one of the arguments based on the non-configurable build env. - - Specifically, it does not return a `select`, and can be used to e.g. - compute elements of list attributes. - """ - return oss_value # copybara:comment_replace return google_value - -def clean_dep(target): - """Returns string to 'target' in @org_tensorflow repository. - - Use this function when referring to targets in the @org_tensorflow - repository from macros that may be called from external repositories. - """ - - # A repo-relative label is resolved relative to the file in which the - # Label() call appears, i.e. @org_tensorflow. - return str(Label(target)) - -# Include specific extra dependencies when building statically, or -# another set of dependencies otherwise. If "macos" is provided, that -# dependency list is used when using the framework_shared_object config -# on MacOS platforms. If "macos" is not provided, the "otherwise" list is -# used for all framework_shared_object platforms including MacOS. 
-def if_static(extra_deps, otherwise = [], macos = []): - ret = { - str(Label("//deepray:framework_shared_object")): otherwise, - "//conditions:default": extra_deps, - } - if macos: - ret[str(Label("//deepray:macos_with_framework_shared_object"))] = macos - return select(ret) - -# version for the shared libraries, can -# not contain rc or alpha, only numbers. -# Also update tensorflow/core/public/version.h -# and tensorflow/tools/pip_package/setup.py -VERSION = "2.14.0" -VERSION_MAJOR = VERSION.split(".")[0] -two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] - -# The workspace root, to be used to set workspace 'include' paths in a way that -# will still work correctly when TensorFlow is included as a dependency of an -# external project. -workspace_root = Label("//:WORKSPACE").workspace_root or "." - -def tf_binary_additional_srcs(fullversion = False): - if fullversion: - suffix = "." + VERSION - else: - suffix = "." + VERSION_MAJOR - - return if_static( - extra_deps = [], - macos = [ - clean_dep("//deepray:libtensorflow_framework%s.dylib" % suffix), - ], - otherwise = [ - clean_dep("//deepray:libtensorflow_framework.so%s" % suffix), - ], - ) - -def _make_search_paths(prefix, levels_to_root): - return ",".join( - [ - "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) - for search_level in range(levels_to_root + 1) - ], - ) - -def _rpath_linkopts(name): - # Search parent directories up to the TensorFlow root directory for shared - # object dependencies, even if this op shared object is deeply nested - # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then - # the root and tensorflow/libtensorflow_framework.so should exist when - # deployed. Other shared object dependencies (e.g. shared between contrib/ - # ops) are picked up as long as they are in either the same or a parent - # directory in the tensorflow/ tree. - levels_to_root = native.package_name().count("/") + name.count("/") - return select({ - clean_dep("//deepray:macos"): [ - "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), - "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), - ], - }) - -def _rpath_user_link_flags(name): - # Search parent directories up to the TensorFlow root directory for shared - # object dependencies, even if this op shared object is deeply nested - # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then - # the root and tensorflow/libtensorflow_framework.so should exist when - # deployed. Other shared object dependencies (e.g. shared between contrib/ - # ops) are picked up as long as they are in either the same or a parent - # directory in the tensorflow/ tree. 
- levels_to_root = native.package_name().count("/") + name.count("/") - return select({ - clean_dep("//deepray:macos"): [ - "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), - "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), - ], - }) - -# buildozer: disable=function-docstring-args -def pybind_extension_opensource( - name, - srcs, - module_name = None, - hdrs = [], - dynamic_deps = [], - static_deps = [], - deps = [], - additional_exported_symbols = [], - compatible_with = None, - copts = [], - data = [], - defines = [], - deprecation = None, - features = [], - link_in_framework = False, - licenses = None, - linkopts = [], - pytype_deps = [], - pytype_srcs = [], - restricted_to = None, - srcs_version = "PY3", - testonly = None, - visibility = None, - win_def_file = None): - """Builds a generic Python extension module.""" - _ignore = [module_name] - p = name.rfind("/") - if p == -1: - sname = name - prefix = "" - else: - sname = name[p + 1:] - prefix = name[:p + 1] - so_file = "%s%s.so" % (prefix, sname) - filegroup_name = "%s_filegroup" % name - pyd_file = "%s%s.pyd" % (prefix, sname) - exported_symbols = [ - "init%s" % sname, - "init_%s" % sname, - "PyInit_%s" % sname, - ] + additional_exported_symbols - - exported_symbols_file = "%s-exported-symbols.lds" % name - version_script_file = "%s-version-script.lds" % name - - exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) - version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) - - native.genrule( - name = name + "_exported_symbols", - outs = [exported_symbols_file], - cmd = "echo '%s' >$@" % exported_symbols_output, - output_licenses = ["unencumbered"], - visibility = ["//visibility:private"], - testonly = testonly, - ) - - native.genrule( - name = name + "_version_script", - outs = [version_script_file], - cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, - output_licenses = ["unencumbered"], - visibility = ["//visibility:private"], - testonly = testonly, - ) - - if static_deps: - cc_library_name = so_file + "_cclib" - cc_library( - name = cc_library_name, - hdrs = hdrs, - srcs = srcs + hdrs, - data = data, - deps = deps, - compatible_with = compatible_with, - copts = copts + [ - "-fno-strict-aliasing", - "-fexceptions", - ] + select({ - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-fvisibility=hidden", - ], - }), - defines = defines, - features = features + ["-use_header_modules"], - restricted_to = restricted_to, - testonly = testonly, - visibility = visibility, - ) - - cc_shared_library( - name = so_file, - roots = [cc_library_name], - dynamic_deps = dynamic_deps, - static_deps = static_deps, - additional_linker_inputs = [exported_symbols_file, version_script_file], - compatible_with = compatible_with, - deprecation = deprecation, - features = features + ["-use_header_modules"], - licenses = licenses, - restricted_to = restricted_to, - shared_lib_name = so_file, - testonly = testonly, - user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ - clean_dep("//deepray:macos"): [ - # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols - # not being exported. There should be a better way to deal with this. 
- "-Wl,-w", - "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,--version-script", - "$(location %s)" % version_script_file, - ], - }), - visibility = visibility, - ) - - # cc_shared_library can generate more than one file. - # Solution to avoid the error "variable '$<' : more than one input file." - filegroup( - name = filegroup_name, - srcs = [so_file], - output_group = "main_shared_library_output", - testonly = testonly, - ) - else: - if link_in_framework: - srcs += tf_binary_additional_srcs() - - cc_binary( - name = so_file, - srcs = srcs + hdrs, - data = data, - copts = copts + [ - "-fno-strict-aliasing", - "-fexceptions", - ] + select({ - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-fvisibility=hidden", - ], - }), - linkopts = linkopts + _rpath_linkopts(name) + select({ - clean_dep("//deepray:macos"): [ - # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols - # not being exported. There should be a better way to deal with this. - "-Wl,-w", - "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,--version-script", - "$(location %s)" % version_script_file, - ], - }), - deps = deps + [ - exported_symbols_file, - version_script_file, - ], - defines = defines, - features = features + ["-use_header_modules"], - linkshared = 1, - testonly = testonly, - licenses = licenses, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - ) - - # For Windows, emulate the above filegroup with the shared object. - native.alias( - name = filegroup_name, - actual = so_file, - ) - - # For Windows only. - native.genrule( - name = name + "_pyd_copy", - srcs = [filegroup_name], - outs = [pyd_file], - cmd = "cp $< $@", - output_to_bindir = True, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - testonly = testonly, - ) - - native.py_library( - name = name, - data = select({ - clean_dep("//deepray:windows"): [pyd_file], - "//conditions:default": [so_file], - }) + pytype_srcs, - deps = pytype_deps, - srcs_version = srcs_version, - licenses = licenses, - testonly = testonly, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - ) - -# Export open source version of pybind_extension under base name as well. 
-pybind_extension = pybind_extension_opensource - -def filegroup(**kwargs): - native.filegroup(**kwargs) - -def genrule(**kwargs): - native.genrule(**kwargs) diff --git a/deepray/utils/BUILD b/deepray/utils/BUILD index 8f511360..27d9f0c2 100644 --- a/deepray/utils/BUILD +++ b/deepray/utils/BUILD @@ -12,6 +12,10 @@ py_library( "//deepray:conftest.py", "//deepray:options.py", ], + deps = [ + "@pypi_tf_keras//:pkg", + "@pypi_tqdm//:pkg", + ], ) py_test( diff --git a/deepray/utils/benchmark.py b/deepray/utils/benchmark.py index a5ac1ada..65d8a608 100644 --- a/deepray/utils/benchmark.py +++ b/deepray/utils/benchmark.py @@ -15,7 +15,6 @@ from time import perf_counter import numpy as np -import tensorflow as tf from absl import logging @@ -38,10 +37,7 @@ def __init__(self, warmup_steps=0, total_steps=0): self.benchmark_start_time = None self.benchmark_after_warmup_start_time = None self.latency_percentiles = (90, 95, 99) - with tf.device("/CPU:0"): - self.samples = tf.Variable(0, trainable=False, dtype=tf.int64) - - self.samples.assign(0) + self.samples = 0 self.step_latencies = [0] self._results = {} # used to represent duration of entire training @@ -76,17 +72,17 @@ def _calculate_latency(self): def _calculate_throughput(self): time_elapsed = perf_counter() - self.benchmark_start_time time_elapsed_after_warmup = perf_counter() - self.benchmark_after_warmup_start_time - benchmark_throughput = self.samples.numpy() / time_elapsed_after_warmup - return {"throughput": benchmark_throughput, "time": time_elapsed} + benchmark_throughput = self.samples / time_elapsed_after_warmup + return {"throughput": benchmark_throughput, "time": time_elapsed, "total_samples": self.samples} def __call__(self, steps, global_batch_size): - self.samples.assign_add(steps * global_batch_size) + self.samples += steps * global_batch_size step_latency = perf_counter() - self.step_start_time step_throughput = steps * global_batch_size / step_latency self.step_latencies.append(step_latency) self.step += steps if self.step == self.warmup_steps: - self.samples.assign(0) + self.samples = 0 self.step_latencies = [] self.benchmark_after_warmup_start_time = perf_counter() elif self.step == self.total_steps: diff --git a/deepray/utils/ckpt_util.py b/deepray/utils/ckpt_util.py new file mode 100644 index 00000000..78394151 --- /dev/null +++ b/deepray/utils/ckpt_util.py @@ -0,0 +1,11 @@ +import tensorflow as tf + + +def print_checkpoint(save_path): + reader = tf.train.load_checkpoint(save_path) + shapes = reader.get_variable_to_shape_map() + dtypes = reader.get_variable_to_dtype_map() + print(f"Checkpoint at '{save_path}':") + for key in shapes: + print(f" (key='{key}', shape={shapes[key]}, dtype={dtypes[key].name}, " + f"value={reader.get_tensor(key)})") \ No newline at end of file diff --git a/deepray/utils/data/feature_map.py b/deepray/utils/data/feature_map.py index 316bd7eb..b5d07690 100644 --- a/deepray/utils/data/feature_map.py +++ b/deepray/utils/data/feature_map.py @@ -3,7 +3,7 @@ # @Author : Hailin.Fu # @license : Copyright(C), import os - +import yaml import pandas as pd import tensorflow as tf from absl import logging, flags @@ -11,125 +11,78 @@ from deepray.design_patterns import SingletonType from deepray.utils.horovod_utils import is_main_process -FLAGS = flags.FLAGS - class FeatureMap(metaclass=SingletonType): - def __init__(self, feature_map, black_list=None, white_list=None): - # Read YAML file - # with open(os.path.join(os.path.dirname(__file__), feature_file), encoding="utf-8") as stream: - # try: - # self.conf = 
yaml.safe_load(stream) - # except yaml.YAMLError as exc: - # logging.error(exc) - self._feature_file = feature_map - self._black_list = black_list - self._white_list = white_list if white_list else FLAGS.white_list - self.feature_map = self.get_summary() - if is_main_process() and self.feature_map is not None: - logging.info( - "\n" + - self.feature_map.loc[:, - ~self.feature_map.columns.isin(["bucket_boundaries", "vocabulary_list"])].to_markdown() + def __init__(self): + if flags.FLAGS.config_file: + # Read YAML file + with open(flags.FLAGS.config_file, encoding="utf-8") as stream: + try: + self.yaml_conf = yaml.safe_load(stream) + except yaml.YAMLError as exc: + logging.error(exc) + if flags.FLAGS.feature_map and tf.io.gfile.exists(flags.FLAGS.feature_map): + self.feature_map = self.get_summary( + feature_map=flags.FLAGS.feature_map, black_list=flags.FLAGS.black_list, white_list=flags.FLAGS.white_list ) - - def get_summary(self): - if not tf.io.gfile.exists(self._feature_file): if is_main_process(): - logging.info(f"File not exists: {self._feature_file}") - return None - with tf.io.gfile.GFile(self._feature_file, mode="r") as f: - file_name, file_extension = os.path.splitext(self._feature_file) - if file_extension == ".csv": - feature_map = pd.read_csv( - f, - dtype={ - "code": int, - "name": "string", - "dtype": "string", - "ftype": "string", - "dim": "uint32", - "length": float, - "voc_size": float, - "lr": "float32", - "optimizer": "string", - "storage_type": "string", - "composition_size": "string", - "ev_filter": "string", - }, - ).fillna( - value={ - "code": -1, - "length": 1.0, - "voc_size": 0.0, - "lr": 0.0, - "optimizer": "", - "storage_type": "", - "composition_size": "", - "ev_filter": "", - } + logging.info("Used features map:") + print( + "\n" + + self.feature_map.loc[:, + ~self.feature_map.columns.isin(["bucket_boundaries", "vocabulary_list"])].to_markdown() ) + else: + logging.info(f"feature_map file not exists: {flags.FLAGS.feature_map}") + self.feature_map = None + + def get_summary(self, feature_map, black_list=None, white_list=None): + with tf.io.gfile.GFile(feature_map, mode="r") as f: + file_name, file_extension = os.path.splitext(feature_map) + sep = None + if file_extension == ".csv": + sep = ',' elif file_extension == ".tsv": - feature_map = pd.read_csv( - f, - sep='\t', - header=None, - usecols=[i for i in range(13)], - names=[ - "code", "name", "length", "dtype", "gpercentile", "gcov", "geva", "bpercentile", "bcov", "beva", - "def_valu", "fea_tag", "dim" - ], - dtype={ - "code": "string", - "name": "string", - "length": float, - "dtype": "string", - # "ftype": "string", - "gpercentile": "string", - "geva": "string", - "bpercentile": "string", - "bcov": "string", - "beva": "string", - "def_valu": "string", - "fea_tag": "string" - }, - ).fillna( - value={ - # "code": "", - # "name": "", - "length": 1.0, - # "dtype": "", - # "gpercentile": "", - # "fea_geva": "", - # "fea_bpercentile": "", - # "fea_bcov": "", - # "fea_beva": "", - "def_valu": "", - # "fea_tag": "" - } - ) + sep = '\t' else: ValueError(f"Not support format for {f}") - if self._black_list: - with open(self._black_list) as f: + feature_map = pd.read_csv( + f, + sep=sep, + dtype={ + "code": int, + "name": "string", + "dtype": "string", + "ftype": "string", + "dim": "uint32", + "length": float, + "voc_size": float, + }, + ).fillna(value={ + "code": -1, + "length": 1.0, + "voc_size": 0.0, + }) + if black_list: + with open(black_list) as f: black_feature_list = [feature.strip() for feature in f] 
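    # A white_list, handled below, may be either a single file of feature names
    # or a directory of such files; every name collected is used to keep only the
    # matching rows of feature_map (mirroring the black_list filter above).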
feature_map = feature_map[~feature_map["name"].isin(black_feature_list)] - if self._white_list: + if white_list: white_feature_list = [] - if os.path.isfile(self._white_list): - print(f'{self._white_list} is a file.') - with open(self._white_list) as f: + if os.path.isfile(white_list): + print(f'{white_list} is a file.') + with open(white_list) as f: white_feature_list += [feature.strip() for feature in f] - elif os.path.isdir(self._white_list): - print(f'{self._white_list} is a directory.') - for used_features in os.listdir(self._white_list): - filename = os.path.join(self._white_list, used_features) + elif os.path.isdir(white_list): + print(f'{white_list} is a directory.') + for used_features in os.listdir(white_list): + filename = os.path.join(white_list, used_features) with open(filename) as f: white_feature_list += [feature.strip() for feature in f] else: - print(f'{self._white_list} is neither a file nor a directory.') + print(f'{white_list} is neither a file nor a directory.') feature_map = feature_map[feature_map["name"].isin(white_feature_list)] @@ -137,7 +90,6 @@ def get_summary(self): for column in [ 'length', 'voc_size', - # 'composition_size' ]: if column in feature_map.columns: feature_map[column] = feature_map[column].astype(int) diff --git a/deepray/utils/data/input_meta.py b/deepray/utils/data/input_meta.py index a2a30865..ab20e450 100644 --- a/deepray/utils/data/input_meta.py +++ b/deepray/utils/data/input_meta.py @@ -10,8 +10,6 @@ from deepray.design_patterns import SingletonType -FLAGS = flags.FLAGS - class InputMeta(metaclass=SingletonType): diff --git a/deepray/utils/dllogger_class.py b/deepray/utils/dllogger_class.py deleted file mode 100644 index 2c851120..00000000 --- a/deepray/utils/dllogger_class.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from dllogger import Logger, StdOutBackend, JSONStreamBackend, Verbosity - - -class dllogger_class(): - - def format_step(self, step): - if isinstance(step, str): - return step - elif isinstance(step, int): - return "Iteration: {} ".format(step) - elif len(step) > 0: - return "Iteration: {} ".format(step[0]) - else: - return "" - - def __init__(self, log_path="bert_dllog.json"): - self.logger = Logger( - [ - StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step), - JSONStreamBackend(Verbosity.VERBOSE, log_path), - ] - ) - self.logger.metadata("mlm_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("nsp_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("avg_loss_step", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("total_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("f1", {"unit": None, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("precision", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("recall", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("mcc", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("exact_match", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata( - "throughput_train", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "TRAIN" - }, - ) - self.logger.metadata( - "throughput_inf", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) - self.logger.metadata( - "throughput_val", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) diff --git a/deepray/utils/export/export.py b/deepray/utils/export/export.py index bfaf8228..d4ca9acc 100644 --- a/deepray/utils/export/export.py +++ b/deepray/utils/export/export.py @@ -26,15 +26,23 @@ import horovod.tensorflow as hvd import tensorflow as tf -from absl import logging, flags -from keras.engine import data_adapter +from absl import flags +from packaging.version import parse + +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras.engine import data_adapter +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine import data_adapter +else: + from keras.src.engine import data_adapter from tensorflow.python.compiler.tensorrt import trt_convert as trt from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +from deepray.utils import logging_util from deepray.utils.horovod_utils import is_main_process, get_world_size, get_rank -FLAGS = flags.FLAGS +logger = logging_util.get_logger() def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32, batch_size=1): @@ -64,33 +72,37 @@ def serving_input_receiver_fn(): def export_to_checkpoint(saver: Union[tf.train.Checkpoint, tf.train.CheckpointManager], checkpoint_number=None): - # TODO(@hejia): Fix export_to_checkpoint when use TFRA. 
- if FLAGS.use_dynamic_embedding: - return def helper(name, _saver): """Saves model to with provided checkpoint prefix.""" - latest_checkpoint_file = tf.train.latest_checkpoint(os.path.join(FLAGS.model_dir, 'ckpt_' + name)) + latest_checkpoint_file = tf.train.latest_checkpoint(os.path.join(flags.FLAGS.model_dir, 'ckpt_' + name)) match = re.search(r"(?<=ckpt-)\d+", latest_checkpoint_file) if latest_checkpoint_file else None latest_step_ckpt = int(match.group()) if match else -1 if latest_step_ckpt != checkpoint_number: save_path = _saver.save(checkpoint_number) - logging.info('Saved checkpoint to {}'.format(save_path)) + logger.info('Saved checkpoint to {}'.format(save_path)) - if is_main_process(): + def _save_fn(): if isinstance(saver, dict): for name, _saver in saver.items(): helper(name, _saver) else: helper(name="main", _saver=saver) + if flags.FLAGS.use_horovod and flags.FLAGS.use_dynamic_embedding: + _save_fn() + else: + _save_fn() + def export_to_savedmodel( model: Union[tf.keras.Model, Dict[Text, tf.keras.Model]], savedmodel_dir: Optional[Text] = None, checkpoint_dir: Optional[Union[Text, Dict[Text, Text]]] = None, - restore_model_using_load_weights: bool = False + restore_model_using_load_weights: bool = False, + include_optimizer: bool = False, + signatures=None ) -> Text: """Export keras model for serving which does not include the optimizer. @@ -112,7 +124,7 @@ def export_to_savedmodel( ValueError when model is not specified. """ - if FLAGS.use_dynamic_embedding and FLAGS.use_horovod: + if flags.FLAGS.use_dynamic_embedding and flags.FLAGS.use_horovod: try: rank_array = hvd.allgather_object(get_rank(), name='check_tfra_ranks') assert len(set(rank_array)) == get_world_size() @@ -120,8 +132,11 @@ def export_to_savedmodel( raise ValueError(f"Shouldn't place {inspect.stack()[0][3]} only in the main_process when use TFRA and Horovod.") def helper(name, _model: tf.keras.Model, _checkpoint_dir): - _savedmodel_dir = os.path.join(FLAGS.model_dir, 'export') if savedmodel_dir is None else savedmodel_dir - _savedmodel_dir = f"{_savedmodel_dir}_{name}" + _savedmodel_dir = os.path.join(flags.FLAGS.model_dir, 'export') if savedmodel_dir is None else savedmodel_dir + if get_world_size() > 1: + _savedmodel_dir = f"{_savedmodel_dir}_{name}_{get_rank()}" + else: + _savedmodel_dir = f"{_savedmodel_dir}_{name}" os.makedirs(_savedmodel_dir, exist_ok=True) if _checkpoint_dir: @@ -139,28 +154,59 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): # Restores the model from latest checkpoint. 
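        # The restore block below locates the newest checkpoint under _checkpoint_dir,
        # requires one to exist, and restores it; assert_existing_objects_matched()
        # fails fast if that checkpoint does not cover the variables the model has
        # already built.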
latest_checkpoint_file = tf.train.latest_checkpoint(_checkpoint_dir) assert latest_checkpoint_file - logging.info('Checkpoint file %s found and restoring from ' - 'checkpoint', latest_checkpoint_file) + logger.info('Checkpoint file %s found and restoring from ' + 'checkpoint', latest_checkpoint_file) checkpoint.restore(latest_checkpoint_file).assert_existing_objects_matched() - options = tf.saved_model.SaveOptions(namespace_whitelist=['TFRA']) if FLAGS.use_dynamic_embedding else None - - if is_main_process(): - tf.saved_model.save(_model, _savedmodel_dir, options=options) + if flags.FLAGS.use_dynamic_embedding: + try: + from tensorflow_recommenders_addons import dynamic_embedding as de + de.keras.models.de_save_model( + _model, _savedmodel_dir, overwrite=True, include_optimizer=include_optimizer, signatures=signatures + ) + except: + # Compatible with TFRA version before commit 460b50847d459ebbf91b30ea0f9499fbc7ed9da0 + def _check_de_var_with_fs_saver(_var): + try: + from tensorflow_recommenders_addons import dynamic_embedding as de + # This function only serves FileSystemSaver. + return hasattr(_var, "params") and \ + hasattr(_var.params, "_created_in_class") and \ + _var.params._saveable_object_creator is not None and \ + isinstance(_var.params.kv_creator.saver, de.FileSystemSaver) + except: + return False + + de_dir = os.path.join(_savedmodel_dir, "variables", "TFRADynamicEmbedding") + options = tf.saved_model.SaveOptions(namespace_whitelist=['TFRA']) + if is_main_process(): + for var in _model.variables: + _is_dump = _check_de_var_with_fs_saver(var) + if _is_dump: + de_var = var.params + if hasattr(de_var, 'saveable'): + de_var.saveable._saver_config.save_path = de_dir + tf.saved_model.save(_model, export_dir=_savedmodel_dir, signatures=signatures, options=options) + else: + for var in _model.variables: + _is_dump = _check_de_var_with_fs_saver(var) + if _is_dump: + de_var = var.params + a2a_emb = de_var._created_in_class + # save other rank's embedding weights + var.params.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) + # save opt weights + if include_optimizer: + de_opt_vars = a2a_emb.optimizer_vars.as_list( + ) if hasattr(a2a_emb.optimizer_vars, "as_list") else a2a_emb.optimizer_vars + for de_opt_var in de_opt_vars: + de_opt_var.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) else: - de_dir = os.path.join(_savedmodel_dir, "variables", "TFRADynamicEmbedding") - for var in _model.variables: - if hasattr(var, "params"): - # save other rank's embedding weights - var.params.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) - # save opt weights - # opt_de_vars = var.params.optimizer_vars.as_list( - # ) if hasattr(var.params.optimizer_vars, "as_list") else var.params.optimizer_vars - # for opt_de_var in opt_de_vars: - # opt_de_var.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) + if is_main_process(): + tf.saved_model.save(_model, export_dir=_savedmodel_dir, signatures=signatures) if is_main_process(): - logging.info(f"save pb model to: {_savedmodel_dir}, without optimizer & traces") + logger.info(f"save pb model to: {_savedmodel_dir}, without optimizer & traces") return _savedmodel_dir @@ -170,7 +216,7 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): _dir = helper(name, _model, _checkpoint_dir=checkpoint_dir[name] if checkpoint_dir else None) ans.append(_dir) prefix_path = longestCommonPrefix(ans) - logging.info(f"Export 
multiple models to {prefix_path}*") + logger.info(f"Export multiple models to {prefix_path}*") return prefix_path else: return helper(name="main", _model=model, _checkpoint_dir=checkpoint_dir) @@ -178,47 +224,52 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): def optimize_for_inference( model: Union[tf.keras.Model, Dict[Text, tf.keras.Model]], - dataset: tf.data.Dataset, savedmodel_dir: Text, + dataset: tf.data.Dataset = None, + signatures=None, ) -> None: - x, y, z = data_adapter.unpack_x_y_sample_weight(next(iter(dataset))) - if isinstance(model, dict): - for name, _model in model.items(): - if "main" in name: - preds = _model(x) - logging.info(preds) - else: - preds = model(x) - logging.info(preds) + x = None + if dataset: + x, y, z = data_adapter.unpack_x_y_sample_weight(next(iter(dataset))) + if isinstance(model, dict): + for name, _model in model.items(): + if "main" in name: + preds = _model(x) + logger.debug(preds) + else: + preds = model(x) + logger.debug(preds) def helper(_model, path): tmp_path = tempfile.mkdtemp(dir='/tmp/') - export_to_savedmodel(_model, savedmodel_dir=tmp_path) + export_to_savedmodel(_model, savedmodel_dir=tmp_path, signatures=signatures) file = os.path.join(path, "saved_model.pb") if tf.io.gfile.exists(file): tf.io.gfile.remove(file) - logging.info(f"Replace optimized saved_modle.pb for {file}") + logger.info(f"Replace optimized saved_modle.pb for {file}") tf.io.gfile.copy(os.path.join(tmp_path + "_main", "saved_model.pb"), file, overwrite=True) else: raise FileNotFoundError(f"{file} does not exist.") if isinstance(model, dict): for name, _model in model.items(): - if "main" in name: - preds = _model(x) - logging.info(preds) + if dataset: + if "main" in name: + preds = _model(x) + logger.info(preds) src = savedmodel_dir + name helper(_model, src) else: - preds = model(x) - logging.info(preds) + if dataset: + preds = model(x) + logger.info(preds) helper(model, savedmodel_dir) class SavedModel: def __init__(self, model_dir, precision): - if FLAGS.use_dynamic_embedding: + if flags.FLAGS.use_dynamic_embedding: from tensorflow_recommenders_addons import dynamic_embedding as de de.enable_inference_mode() @@ -226,7 +277,7 @@ def __init__(self, model_dir, precision): self.graph_func = self.saved_model_loaded.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] self.precision = tf.float16 if precision == "amp" else tf.float32 - if not FLAGS.run_eagerly: + if not flags.FLAGS.run_eagerly: self._infer_step = tf.function(self.infer_step) else: self._infer_step = self.infer_step @@ -244,7 +295,7 @@ class TFTRTModel: def export_model(self, model_dir, prec, tf_trt_model_dir=None): loaded_model = tf.saved_model.load(model_dir) signature = loaded_model.signatures['serving_default'] - logging.info(signature) + logger.info(signature) # input_shape = [1, 384] # dummy_input = tf.constant(tf.zeros(input_shape, dtype=tf.int32)) # x = [ @@ -262,13 +313,13 @@ def export_model(self, model_dir, prec, tf_trt_model_dir=None): converter.convert() tf_trt_model_dir = tf_trt_model_dir or f'/tmp/tf-trt_model_{prec}' converter.save(tf_trt_model_dir) - logging.info(f"TF-TRT model saved at {tf_trt_model_dir}") + logger.info(f"TF-TRT model saved at {tf_trt_model_dir}") def __init__(self, model_dir, precision): temp_tftrt_dir = f"/tmp/tf-trt_model_{precision}" self.export_model(model_dir, precision, temp_tftrt_dir) saved_model_loaded = tf.saved_model.load(temp_tftrt_dir, tags=[tag_constants.SERVING]) - logging.info(f"TF-TRT model loaded from {temp_tftrt_dir}") + 
logger.info(f"TF-TRT model loaded from {temp_tftrt_dir}") self.graph_func = saved_model_loaded.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] self.precision = tf.float16 if precision == "amp" else tf.float32 diff --git a/deepray/utils/flags/_base.py b/deepray/utils/flags/_base.py index 400a3b5d..f4214d77 100644 --- a/deepray/utils/flags/_base.py +++ b/deepray/utils/flags/_base.py @@ -26,7 +26,7 @@ def define_base( num_train_examples=False, learning_rate=False, optimizer_type=False, - keras_use_ctl=False, + use_custom_training_loop=False, model_dir=False, clean=False, num_accumulation_steps=False, @@ -36,9 +36,7 @@ def define_base( num_gpus=False, init_checkpoint=False, hooks=False, - dllog_path=False, export_dir=False, - save_checkpoint_steps=False, run_eagerly=False ): """Register base flags. @@ -74,13 +72,13 @@ def define_base( if learning_rate: flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.') key_flags.append("learning_rate") - if keras_use_ctl: + if use_custom_training_loop: flags.DEFINE_bool( - name="keras_use_ctl", + name="use_custom_training_loop", default=True, help=flags_core.help_wrap("If True, we use a custom training loop for keras.") ) - key_flags.append("keras_use_ctl") + key_flags.append("use_custom_training_loop") if optimizer_type: flags.DEFINE_string("optimizer_type", "adam", "Optimizer used for training - LAMB or ADAM") key_flags.append("optimizer_type") @@ -94,16 +92,6 @@ def define_base( flags.DEFINE_list("init_weights", '', "Initial weights for the main model.") key_flags.append("init_weights") - if save_checkpoint_steps: - flags.DEFINE_integer( - 'save_checkpoint_steps', sys.maxsize, - 'save checkpoint for every n steps. Default value will not save checkpoint during training.' - ) - key_flags.append("save_checkpoint_steps") - if dllog_path: - flags.DEFINE_string('dllog_path', 'deepray_dllogger.json', 'filename where dllogger writes to') - key_flags.append("dllog_path") - if model_dir: flags.DEFINE_string( name="model_dir", diff --git a/deepray/utils/flags/_benchmark.py b/deepray/utils/flags/_benchmark.py index 7333e155..834c0301 100644 --- a/deepray/utils/flags/_benchmark.py +++ b/deepray/utils/flags/_benchmark.py @@ -26,11 +26,10 @@ def define_log_steps(): return [] -def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader=False): +def define_benchmark(bigquery_uploader=False): """Register benchmarking flags. Args: - benchmark_log_dir: Create a flag to specify location for benchmark logging. bigquery_uploader: Create flags for uploading results to BigQuery. 
Returns: @@ -38,11 +37,6 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= """ key_flags = [] - - if benchmark: - flags.DEFINE_boolean('benchmark', False, 'Benchmark mode.') - key_flags.append("benchmark") - flags.DEFINE_enum( name="benchmark_logger_type", default="BaseBenchmarkLogger", @@ -68,11 +62,6 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= define_log_steps() - if benchmark_log_dir: - flags.DEFINE_string( - name="benchmark_log_dir", default=None, help=help_wrap("The location of the benchmark logging.") - ) - if bigquery_uploader: flags.DEFINE_string( name="gcp_project", default=None, help=help_wrap("The GCP project name where the benchmark will be uploaded.") @@ -105,15 +94,4 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= "information will be uploaded.") ) - @flags.multi_flags_validator( - ["benchmark_logger_type", "benchmark_log_dir"], - message="--benchmark_logger_type=BenchmarkFileLogger will require " - "--benchmark_log_dir being set" - ) - def _check_benchmark_log_dir(flags_dict): - benchmark_logger_type = flags_dict["benchmark_logger_type"] - if benchmark_logger_type == "BenchmarkFileLogger": - return flags_dict["benchmark_log_dir"] - return True - return key_flags diff --git a/deepray/utils/flags/_device.py b/deepray/utils/flags/_device.py index 1278ab15..1fc06419 100644 --- a/deepray/utils/flags/_device.py +++ b/deepray/utils/flags/_device.py @@ -57,7 +57,7 @@ def define_device(tpu=False, redis=False): if tpu: flags.DEFINE_string( - name="tpu", + name="tpu_address", default=None, help=help_wrap( "The Cloud TPU to use for training. This should be either the name " @@ -66,7 +66,7 @@ def define_device(tpu=False, redis=False): "CPU of the local instance instead. (Good for debugging.)" ) ) - key_flags.append("tpu") + key_flags.append("tpu_address") flags.DEFINE_string( name="tpu_zone", diff --git a/deepray/utils/flags/_distribution.py b/deepray/utils/flags/_distribution.py index 49d5f89b..bf7420c7 100644 --- a/deepray/utils/flags/_distribution.py +++ b/deepray/utils/flags/_distribution.py @@ -18,7 +18,7 @@ from deepray.utils.flags._conventions import help_wrap -def define_distribution(use_horovod=True, distribution_strategy=False, worker_hosts=True, task_index=True): +def define_distribution(use_horovod=True, distribution_strategy=False, worker_hosts=False, task_index=False): """Register distributed execution flags. Args: @@ -33,16 +33,16 @@ def define_distribution(use_horovod=True, distribution_strategy=False, worker_ho key_flags = [] if use_horovod: - flags.DEFINE_bool('use_horovod', False, 'Whether to use horovod.') + flags.DEFINE_bool("use_horovod", False, 'Whether to use horovod.') key_flags.append("use_horovod") if distribution_strategy: flags.DEFINE_string( name="distribution_strategy", - default="mirrored", + default="off", help=help_wrap( "The Distribution Strategy to use for training. " - "Accepted values are 'off', 'one_device', " + "Accepted values are 'off', 'horovod', 'one_device', " "'mirrored', 'parameter_server', 'collective', " "case insensitive. 'off' means not to use " "Distribution Strategy; 'default' means to choose " diff --git a/deepray/utils/flags/common_flags.py b/deepray/utils/flags/common_flags.py index f4b9e051..4939aeb1 100644 --- a/deepray/utils/flags/common_flags.py +++ b/deepray/utils/flags/common_flags.py @@ -13,9 +13,6 @@ # limitations under the License. 
# ============================================================================== """Defining common flags used across all BERT models/applications.""" -import datetime -import logging -import os import tensorflow as tf from absl import flags @@ -25,14 +22,13 @@ def define_common_flags(): """Define common flags for BERT tasks.""" - logging.info("flags base......................................") flags_core.define_base( train_data=True, num_train_examples=True, batch_size=True, learning_rate=True, optimizer_type=True, - keras_use_ctl=True, + use_custom_training_loop=True, num_accumulation_steps=True, init_checkpoint=True, num_gpus=True, @@ -43,8 +39,6 @@ def define_common_flags(): hooks=False, export_dir=False, run_eagerly=True, - dllog_path=True, - save_checkpoint_steps=True, ) flags.DEFINE_string( 'config_file', @@ -56,65 +50,19 @@ def define_common_flags(): '`--config_file` and `--params_override`, `config_file` will be used ' 'first, followed by params_override.' ) - flags.DEFINE_string('vocab_file', None, 'The vocabulary file that the BERT model was trained on.') - flags.DEFINE_bool( - "do_lower_case", True, "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models." - ) flags.DEFINE_integer( - 'steps_per_summary', 200, 'Number of steps per graph-mode loop. Only training step ' + 'steps_per_execution', None, 'Number of steps per graph-mode loop. Only training step ' 'happens inside the loop. Callbacks will not be called ' 'inside.' ) flags.DEFINE_integer("stop_steps", -1, "steps when training stops") - flags.DEFINE_boolean( - 'scale_loss', False, 'Whether to divide the loss by number of replica inside the per-replica ' - 'loss function.' - ) - flags.DEFINE_string( - 'hub_module_url', None, 'TF-Hub path/url to Bert module. ' - 'If specified, init_checkpoint flag should not be used.' - ) flags.DEFINE_string( 'model_name', None, 'Specifies the name of the model. ' 'If "bert", will use canonical BERT; if "albert", will use ALBERT model.' ) - flags.DEFINE_enum( - 'mode', 'train_and_predict', - ['train_and_predict', 'train', 'predict', 'export_only', 'sm_predict', 'trt_predict'], - 'One of {"train_and_predict", "train", "predict", "export_only", "sm_predict", "trt_predict"}. ' - '`train_and_predict`: both train and predict to a json file. ' - '`train`: only trains the model. ' - 'trains the model and evaluates in the meantime. ' - '`predict`: predict answers from the squad json file. ' - '`export_only`: will take the latest checkpoint inside ' - 'model_dir and export a `SavedModel`.' - '`sm_predict`: will load SavedModel from savedmodel_dir and predict answers' - '`trt_predict`: will load SavedModel from savedmodel_dir, convert and predict answers with TF-TRT' - ) - flags.DEFINE_string( - 'input_meta_data_path', None, 'Path to file that contains meta data about input ' - 'to be used for training and evaluation.' - ) flags.DEFINE_bool("use_dynamic_embedding", False, "Whether use tfra.dynamic_embedding.") - flags.DEFINE_string('predict_file', None, 'Prediction data path with train tfrecords.') - flags.DEFINE_string( - "eval_script", None, "SQuAD evaluate.py file to compute f1 and exact_match E.g., evaluate-v1.1.py" - ) flags.DEFINE_integer( - 'n_best_size', 20, 'The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file.' - ) - flags.DEFINE_integer( - 'max_answer_length', 30, 'The maximum length of an answer that can be generated. 
This is needed ' - 'because the start and end predictions are not conditioned on one another.' - ) - flags.DEFINE_bool( - 'verbose_logging', False, 'If true, all of the warnings related to data processing will be printed. ' - 'A number of warnings are expected for a normal SQuAD evaluation.' - ) - flags.DEFINE_integer( - "random_seed", 12345, help=flags_core.help_wrap("This value will be used to seed both NumPy and TensorFlow.") + "random_seed", None, help=flags_core.help_wrap("This value will be used to seed both NumPy and TensorFlow.") ) # Adds flags for mixed precision training. flags_core.define_performance( @@ -124,7 +72,7 @@ def define_common_flags(): synthetic_data=False, max_train_steps=False, dtype=True, - dynamic_loss_scale=True, + dynamic_loss_scale=False, loss_scale=True, all_reduce_alg=False, num_packs=False, @@ -135,30 +83,21 @@ def define_common_flags(): flags_core.define_distribution(distribution_strategy=True) flags_core.define_data( dataset=True, - data_dir=True, - download_if_missing=True, - ) - flags_core.define_device(tpu=False, redis=True) - flags_core.define_benchmark(benchmark=True,) - - flags.DEFINE_string( - name="date", default=(datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), help="" + data_dir=False, + download_if_missing=False, ) - flags.DEFINE_string(name="restore_date", default=None, help="") - flags.DEFINE_string(name="start_date", default=None, help="") - flags.DEFINE_string(name="end_date", default=None, help="") - flags.DEFINE_string(name="fine_tune", default=None, help="") - flags.DEFINE_string(name="warmup_path", default=None, help="") + flags_core.define_device(tpu=False, redis=False) + flags_core.define_benchmark() flags.DEFINE_float( "dropout_rate", default=-1, help="Dropout rate for all the classification MLPs (default: -1, disabled).", ) - flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") flags.DEFINE_integer("prebatch", 1, "prebatch size for tfrecord") - flags.DEFINE_string("feature_map", os.path.join(os.getcwd(), "business/data/feature_map.csv"), "path to feature_map") + flags.DEFINE_string("feature_map", None, "path to feature_map") flags.DEFINE_string("black_list", None, "black list for feature_map") flags.DEFINE_string("white_list", None, "white list for feature_map") + flags.DEFINE_integer("ev_slot_num", 0, "ev_slot_num") def use_float16(): diff --git a/deepray/utils/flags/core.py b/deepray/utils/flags/core.py index b727fafb..4f89965e 100644 --- a/deepray/utils/flags/core.py +++ b/deepray/utils/flags/core.py @@ -159,7 +159,7 @@ def parse_flags(flags_obj): "epsilon": flags_obj.epsilon, "match_mlperf": flags_obj.ml_perf, # "epochs_between_evals": flags_obj.epochs_between_evals, - "keras_use_ctl": flags_obj.keras_use_ctl, + "use_custom_training_loop": flags_obj.use_custom_training_loop, "hr_threshold": flags_obj.hr_threshold, "stream_files": flags_obj.tpu is not None, "train_dataset_path": flags_obj.train_dataset_path, diff --git a/deepray/utils/horovod_utils.py b/deepray/utils/horovod_utils.py index 6da2d21a..5e62d6ad 100644 --- a/deepray/utils/horovod_utils.py +++ b/deepray/utils/horovod_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -11,10 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import horovod.tensorflow.keras as hvd -from absl import logging, flags +# We don't want the whole process to quit because of the import failure when +# we don't use horovod to do communication. +try: + import horovod.tensorflow as hvd +except ImportError: + pass +from absl import flags -FLAGS = flags.FLAGS +from deepray.utils import logging_util + +logger = logging_util.get_logger() def get_rank(): @@ -32,4 +39,26 @@ def get_world_size(): def is_main_process(): - return not FLAGS.use_horovod or get_rank() == 0 + return not flags.FLAGS.use_horovod or get_rank() == 0 + + +def main_info(info): + if is_main_process(): + logger.info(info) + + +def main_warning(info): + if is_main_process(): + logger.warning(info) + + +def id_in_rank(): + return 0 + + +def num_gpu_per_rank(): + return 1 + + +def global_gpu_id(): + return get_rank() diff --git a/deepray/utils/keras_utils.py b/deepray/utils/keras_utils.py index 6bd2663a..47620db9 100644 --- a/deepray/utils/keras_utils.py +++ b/deepray/utils/keras_utils.py @@ -1,4 +1,4 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,154 +11,70 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== -"""Utilities for tf.keras.""" +"""Helper functions for the Keras implementations of models.""" + +import multiprocessing +import os import tensorflow as tf +from absl import logging +from tensorflow.python import tf2 +from deepray.utils import logging_util -def is_tensor_or_variable(x): - return tf.is_tensor(x) or isinstance(x, tf.Variable) - - -class LossFunctionWrapper(tf.keras.losses.Loss): - """Wraps a loss function in the `Loss` class.""" - - def __init__(self, fn, reduction=tf.keras.losses.Reduction.AUTO, name=None, **kwargs): - """Initializes `LossFunctionWrapper` class. - - Args: - fn: The loss function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: (Optional) name for the loss. - **kwargs: The keyword arguments that are passed on to `fn`. - """ - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - - def call(self, y_true, y_pred): - """Invokes the `LossFunctionWrapper` instance. - - Args: - y_true: Ground truth values. - y_pred: The predicted values. - - Returns: - Loss values per sample. 
- """ - return self.fn(y_true, y_pred, **self._fn_kwargs) - - def get_config(self): - config = {} - for k, v in iter(self._fn_kwargs.items()): - config[k] = tf.keras.backend.eval(v) if is_tensor_or_variable(v) else v - base_config = super().get_config() - return {**base_config, **config} - - -def normalize_data_format(value): - if value is None: - value = tf.keras.backend.image_data_format() - data_format = value.lower() - if data_format not in {"channels_first", "channels_last"}: - raise ValueError( - "The `data_format` argument must be one of " - '"channels_first", "channels_last". Received: ' + str(value) - ) - return data_format - - -def normalize_tuple(value, n, name): - """Transforms an integer or iterable of integers into an integer tuple. - - A copy of tensorflow.python.keras.util. - - Args: - value: The value to validate and convert. Could an int, or any iterable - of ints. - n: The size of the tuple to be returned. - name: The name of the argument being validated, e.g. "strides" or - "kernel_size". This is only used to format error messages. - - Returns: - A tuple of n integers. - - Raises: - ValueError: If something else than an int/long or iterable thereof was - passed. - """ - if isinstance(value, int): - return (value,) * n - else: - try: - value_tuple = tuple(value) - except TypeError: - raise TypeError("The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value)) - if len(value_tuple) != n: - raise ValueError( - "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) - ) - for single_value in value_tuple: - try: - int(single_value) - except (ValueError, TypeError): - raise ValueError( - "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + " " - "including element " + str(single_value) + " of type" + " " + str(type(single_value)) - ) - return value_tuple - - -def _hasattr(obj, attr_name): - # If possible, avoid retrieving the attribute as the object might run some - # lazy computation in it. - if attr_name in dir(obj): - return True - try: - getattr(obj, attr_name) - except AttributeError: - return False +logger = logging_util.get_logger() + + +def set_session_config(enable_eager=False, enable_xla=False): + """Sets the session config.""" + if is_v2_0(): + set_config_v2(enable_xla=enable_xla) else: - return True - - -def assert_like_rnncell(cell_name, cell): - """Raises a TypeError if cell is not like a - tf.keras.layers.AbstractRNNCell. - - Args: - cell_name: A string to give a meaningful error referencing to the name - of the function argument. - cell: The object which should behave like a - tf.keras.layers.AbstractRNNCell. - - Raises: - TypeError: A human-friendly exception. 
- """ - conditions = [ - _hasattr(cell, "output_size"), - _hasattr(cell, "state_size"), - _hasattr(cell, "get_initial_state"), - callable(cell), - ] - - errors = [ - "'output_size' property is missing", - "'state_size' property is missing", - "'get_initial_state' method is required", - "is not callable", - ] - - if not all(conditions): - errors = [error for error, cond in zip(errors, conditions) if not cond] - raise TypeError("The argument {!r} ({}) is not an RNNCell: {}.".format(cell_name, cell, ", ".join(errors))) + config = get_config_proto_v1(enable_xla=enable_xla) + if enable_eager: + tf.compat.v1.enable_eager_execution(config=config) + else: + sess = tf.Session(config=config) + tf.keras.backend.set_session(sess) + + +def get_config_proto_v1(enable_xla=False): + """Return config proto according to flag settings, or None to use default.""" + config = None + if enable_xla: + config = tf.compat.v1.ConfigProto() + config.graph_options.optimizer_options.global_jit_level = (tf.OptimizerOptions.ON_2) + return config + + +def set_config_v2(enable_xla=False): + """Config eager context according to flag values using TF 2.0 API.""" + if enable_xla: + tf.config.optimizer.set_jit(True) + logger.info("XLA activated") + + +def is_v2_0(): + """Returns true if using tf 2.0.""" + return tf2.enabled() + + +def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) + logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. + total_gpu_thread_count = per_gpu_thread_count * num_gpus + num_runtime_threads = num_gpus + if not datasets_num_private_threads: + datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) + logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads) diff --git a/deepray/utils/logging_util.py b/deepray/utils/logging_util.py new file mode 100644 index 00000000..0de16c7d --- /dev/null +++ b/deepray/utils/logging_util.py @@ -0,0 +1,392 @@ +# coding=utf-8 +# Copyright 2020 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Logging utilities.""" + +import functools +import logging +import os +import sys +import threading +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) +from logging import captureWarnings as _captureWarnings +from typing import Optional + +from tqdm import auto as tqdm_lib + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "detail": logging.DEBUG, # will also print filename and line number + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.INFO + +_tqdm_active = True + + +def _get_default_logging_level(): + """ + If DEEPRAY_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to `_default_log_level` + """ + env_level_str = os.getenv("DEEPRAY_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option DEEPRAY_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) + return _default_log_level + + +def _get_library_name() -> str: + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. + return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + # set defaults based on https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 + if sys.stderr is None: + sys.stderr = open(os.devnull, "w") + + _default_handler.flush = sys.stderr.flush + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + # if logging level is debug, we add pathname and lineno to formatter for easy debugging + if os.getenv("DEEPRAY_VERBOSITY", None) == "detail": + formatter = logging.Formatter("%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s") + _default_handler.setFormatter(formatter) + + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict(): + return log_levels + + +def captureWarnings(capture): + """ + Calls the `captureWarnings` method from the logging library to enable management of the warnings emitted by the + `warnings` library. + + Read more about this method here: + https://docs.python.org/3/library/logging.html#integration-with-the-warnings-module + + All warnings will be logged through the `py.warnings` logger. + + Careful: this method also adds a handler to this logger if it does not already have one, and updates the logging + level of that logger to the library's root logger. 
+ """ + logger = get_logger("py.warnings") + + if not logger.handlers: + logger.addHandler(_default_handler) + + logger.setLevel(_get_library_root_logger().level) + + _captureWarnings(capture) + + +def get_logger(name: Optional[str] = None) -> logging.Logger: + """ + Return a logger with the specified name. + + This function is not supposed to be directly accessed unless you are writing a custom transformers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + return logging.getLogger(name) + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Transformers's root logger as an int. + + Returns: + `int`: The logging level. + + + + 🤗 Transformers has following logging levels: + + - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - 40: `transformers.logging.ERROR` + - 30: `transformers.logging.WARNING` or `transformers.logging.WARN` + - 20: `transformers.logging.INFO` + - 10: `transformers.logging.DEBUG` + + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Transformers's root logger. + + Args: + verbosity (`int`): + Logging level, e.g., one of: + + - `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - `transformers.logging.ERROR` + - `transformers.logging.WARNING` or `transformers.logging.WARN` + - `transformers.logging.INFO` + - `transformers.logging.DEBUG` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info(): + """Set the verbosity to the `INFO` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning(): + """Set the verbosity to the `WARNING` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug(): + """Set the verbosity to the `DEBUG` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error(): + """Set the verbosity to the `ERROR` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler not in _get_library_root_logger().handlers + _get_library_root_logger().removeHandler(handler) + + +def disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. 
+  Please disable Deepray's default handler to prevent double logging if the root logger has been configured.
+  """
+
+  _configure_library_root_logger()
+  _get_library_root_logger().propagate = True
+
+
+def enable_explicit_format() -> None:
+  """
+  Enable explicit formatting for every Deepray logger. The explicit formatter is as follows:
+  ```
+  [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
+  ```
+  All handlers currently bound to the root logger are affected by this method.
+  """
+  handlers = _get_library_root_logger().handlers
+
+  for handler in handlers:
+    formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+    handler.setFormatter(formatter)
+
+
+def reset_format() -> None:
+  """
+  Resets the formatting for Deepray's loggers.
+
+  All handlers currently bound to the root logger are affected by this method.
+  """
+  handlers = _get_library_root_logger().handlers
+
+  for handler in handlers:
+    handler.setFormatter(None)
+
+
+def warning_advice(self, *args, **kwargs):
+  """
+  This method is identical to `logger.warning()`, but if the env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, the
+  warning will not be printed.
+  """
+  no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False)
+  if no_advisory_warnings:
+    return
+  self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_advice = warning_advice
+
+
+@functools.lru_cache(None)
+def warning_once(self, *args, **kwargs):
+  """
+  This method is identical to `logger.warning()`, but will emit the warning with the same message only once.
+
+  Note: The cache is keyed on the function arguments, so two different callers using the same arguments will hit the
+  cache. The assumption here is that all warning messages are unique across the code. If they aren't, then we need to
+  switch to another type of cache that includes the caller frame information in the hashing function.
+ """ + self.warning(*args, **kwargs) + + +logging.Logger.warning_once = warning_once + + +class EmptyTqdm: + """Dummy tqdm which doesn't do anything.""" + + def __init__(self, *args, **kwargs): # pylint: disable=unused-argument + self._iterator = args[0] if args else None + + def __iter__(self): + return iter(self._iterator) + + def __getattr__(self, _): + """Return empty function.""" + + def empty_fn(*args, **kwargs): # pylint: disable=unused-argument + return + + return empty_fn + + def __enter__(self): + return self + + def __exit__(self, type_, value, traceback): + return + + +class _tqdm_cls: + + def __call__(self, *args, **kwargs): + if _tqdm_active: + return tqdm_lib.tqdm(*args, **kwargs) + else: + return EmptyTqdm(*args, **kwargs) + + def set_lock(self, *args, **kwargs): + self._lock = None + if _tqdm_active: + return tqdm_lib.tqdm.set_lock(*args, **kwargs) + + def get_lock(self): + if _tqdm_active: + return tqdm_lib.tqdm.get_lock() + + +tqdm = _tqdm_cls() + + +def is_progress_bar_enabled() -> bool: + """Return a boolean indicating whether tqdm progress bars are enabled.""" + global _tqdm_active + return bool(_tqdm_active) + + +def enable_progress_bar(): + """Enable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = True + + +def disable_progress_bar(): + """Disable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = False diff --git a/deepray/utils/logs/hooks.py b/deepray/utils/logs/hooks.py deleted file mode 100644 index 065c2fef..00000000 --- a/deepray/utils/logs/hooks.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Hook that counts examples per second every N steps or seconds.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf # pylint: disable=g-bad-import-order - -from official.utils.logs import logger - - -class ExamplesPerSecondHook(tf.estimator.SessionRunHook): - """Hook to print out examples per second. - - Total time is tracked and then divided by the total number of steps - to get the average step time and then batch_size is used to determine - the running average of examples per second. The examples per second for the - most recent interval is also logged. - """ - - def __init__(self, batch_size, every_n_steps=None, every_n_secs=None, warm_steps=0, metric_logger=None): - """Initializer for ExamplesPerSecondHook. - - Args: - batch_size: Total batch size across all workers used to calculate - examples/second from global time. - every_n_steps: Log stats every n steps. - every_n_secs: Log stats every n seconds. Exactly one of the - `every_n_steps` or `every_n_secs` should be set. - warm_steps: The number of steps to be skipped before logging and running - average calculation. 
warm_steps steps refers to global steps across all - workers, not on each worker - metric_logger: instance of `BenchmarkLogger`, the benchmark logger that - hook should use to write the log. If None, BaseBenchmarkLogger will - be used. - - Raises: - ValueError: if neither `every_n_steps` or `every_n_secs` is set, or - both are set. - """ - - if (every_n_steps is None) == (every_n_secs is None): - raise ValueError("exactly one of every_n_steps" - " and every_n_secs should be provided.") - - self._logger = metric_logger or logger.BaseBenchmarkLogger() - - self._timer = tf.estimator.SecondOrStepTimer(every_steps=every_n_steps, every_secs=every_n_secs) - - self._step_train_time = 0 - self._total_steps = 0 - self._batch_size = batch_size - self._warm_steps = warm_steps - # List of examples per second logged every_n_steps. - self.current_examples_per_sec_list = [] - - def begin(self): - """Called once before using the session to check global step.""" - self._global_step_tensor = tf.compat.v1.train.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepCounterHook.") - - def before_run(self, run_context): # pylint: disable=unused-argument - """Called before each call to run(). - - Args: - run_context: A SessionRunContext object. - - Returns: - A SessionRunArgs object or None if never triggered. - """ - return tf.estimator.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): # pylint: disable=unused-argument - """Called after each call to run(). - - Args: - run_context: A SessionRunContext object. - run_values: A SessionRunValues object. - """ - global_step = run_values.results - - if self._timer.should_trigger_for_step(global_step) and global_step > self._warm_steps: - elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(global_step) - if elapsed_time is not None: - self._step_train_time += elapsed_time - self._total_steps += elapsed_steps - - # average examples per second is based on the total (accumulative) - # training steps and training time so far - average_examples_per_sec = self._batch_size * (self._total_steps / self._step_train_time) - # current examples per second is based on the elapsed training steps - # and training time per batch - current_examples_per_sec = self._batch_size * (elapsed_steps / elapsed_time) - # Logs entries to be read from hook during or after run. - self.current_examples_per_sec_list.append(current_examples_per_sec) - self._logger.log_metric("average_examples_per_sec", average_examples_per_sec, global_step=global_step) - - self._logger.log_metric("current_examples_per_sec", current_examples_per_sec, global_step=global_step) diff --git a/deepray/utils/logs/hooks_test.py b/deepray/utils/logs/hooks_test.py deleted file mode 100644 index cb3c18ad..00000000 --- a/deepray/utils/logs/hooks_test.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for hooks.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import time - -import tensorflow as tf # pylint: disable=g-bad-import-order - -from official.utils.logs import hooks -from official.utils.testing import mock_lib - -logging.set_verbosity(logging.DEBUG) - - -class ExamplesPerSecondHookTest(tf.test.TestCase): - """Tests for the ExamplesPerSecondHook. - - In the test, we explicitly run global_step tensor after train_op in order to - keep the global_step value and the train_op (which increase the glboal_step - by 1) consistent. This is to correct the discrepancies in reported global_step - value when running on GPUs. - """ - - def setUp(self): - """Mock out logging calls to verify if correct info is being monitored.""" - self._logger = mock_lib.MockBenchmarkLogger() - - self.graph = tf.Graph() - with self.graph.as_default(): - tf.compat.v1.train.create_global_step() - self.train_op = tf.compat.v1.assign_add(tf.compat.v1.train.get_global_step(), 1) - self.global_step = tf.compat.v1.train.get_global_step() - - def test_raise_in_both_secs_and_steps(self): - with self.assertRaises(ValueError): - hooks.ExamplesPerSecondHook(batch_size=256, every_n_steps=10, every_n_secs=20, metric_logger=self._logger) - - def test_raise_in_none_secs_and_steps(self): - with self.assertRaises(ValueError): - hooks.ExamplesPerSecondHook(batch_size=256, every_n_steps=None, every_n_secs=None, metric_logger=self._logger) - - def _validate_log_every_n_steps(self, every_n_steps, warm_steps): - hook = hooks.ExamplesPerSecondHook( - batch_size=256, every_n_steps=every_n_steps, warm_steps=warm_steps, metric_logger=self._logger - ) - - with tf.compat.v1.train.MonitoredSession(tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess: - for _ in range(every_n_steps): - # Explicitly run global_step after train_op to get the accurate - # global_step value - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - - mon_sess.run(self.train_op) - global_step_val = mon_sess.run(self.global_step) - - if global_step_val > warm_steps: - self._assert_metrics() - else: - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - - # Add additional run to verify proper reset when called multiple times. - prev_log_len = len(self._logger.logged_metric) - mon_sess.run(self.train_op) - global_step_val = mon_sess.run(self.global_step) - - if every_n_steps == 1 and global_step_val > warm_steps: - # Each time, we log two additional metrics. Did exactly 2 get added? - self.assertEqual(len(self._logger.logged_metric), prev_log_len + 2) - else: - # No change in the size of the metric list. 
- self.assertEqual(len(self._logger.logged_metric), prev_log_len) - - def test_examples_per_sec_every_1_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(1, 0) - - def test_examples_per_sec_every_5_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(5, 0) - - def test_examples_per_sec_every_1_steps_with_warm_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(1, 10) - - def test_examples_per_sec_every_5_steps_with_warm_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(5, 10) - - def _validate_log_every_n_secs(self, every_n_secs): - hook = hooks.ExamplesPerSecondHook( - batch_size=256, every_n_steps=None, every_n_secs=every_n_secs, metric_logger=self._logger - ) - - with tf.compat.v1.train.MonitoredSession(tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess: - # Explicitly run global_step after train_op to get the accurate - # global_step value - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - time.sleep(every_n_secs) - - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - self._assert_metrics() - - def test_examples_per_sec_every_1_secs(self): - with self.graph.as_default(): - self._validate_log_every_n_secs(1) - - def test_examples_per_sec_every_5_secs(self): - with self.graph.as_default(): - self._validate_log_every_n_secs(5) - - def _assert_metrics(self): - metrics = self._logger.logged_metric - self.assertEqual(metrics[-2]["name"], "average_examples_per_sec") - self.assertEqual(metrics[-1]["name"], "current_examples_per_sec") - - -if __name__ == "__main__": - tf.test.main() diff --git a/deepray/utils/logs/logger.py b/deepray/utils/logs/logger.py index 3863149a..554b8de7 100644 --- a/deepray/utils/logs/logger.py +++ b/deepray/utils/logs/logger.py @@ -45,8 +45,6 @@ RUN_STATUS_FAILURE = "failure" RUN_STATUS_RUNNING = "running" -FLAGS = flags.FLAGS - # Don't use it directly. Use get_benchmark_logger to access a logger. _benchmark_logger = None _logger_lock = threading.Lock() diff --git a/deepray/utils/logs/metric_hook.py b/deepray/utils/logs/metric_hook.py deleted file mode 100644 index 73d3b6bc..00000000 --- a/deepray/utils/logs/metric_hook.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Session hook for logging benchmark metric.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf # pylint: disable=g-bad-import-order - - -class LoggingMetricHook(tf.estimator.LoggingTensorHook): - """Hook to log benchmark metric information. - - This hook is very similar as tf.train.LoggingTensorHook, which logs given - tensors every N local steps, every N seconds, or at the end. 
The metric - information will be logged to given log_dir or via metric_logger in JSON - format, which can be consumed by data analysis pipeline later. - - Note that if `at_end` is True, `tensors` should not include any tensor - whose evaluation produces a side effect such as consuming additional inputs. - """ - - def __init__(self, tensors, metric_logger=None, every_n_iter=None, every_n_secs=None, at_end=False): - """Initializer for LoggingMetricHook. - - Args: - tensors: `dict` that maps string-valued tags to tensors/tensor names, - or `iterable` of tensors/tensor names. - metric_logger: instance of `BenchmarkLogger`, the benchmark logger that - hook should use to write the log. - every_n_iter: `int`, print the values of `tensors` once every N local - steps taken on the current worker. - every_n_secs: `int` or `float`, print the values of `tensors` once every N - seconds. Exactly one of `every_n_iter` and `every_n_secs` should be - provided. - at_end: `bool` specifying whether to print the values of `tensors` at the - end of the run. - - Raises: - ValueError: - 1. `every_n_iter` is non-positive, or - 2. Exactly one of every_n_iter and every_n_secs should be provided. - 3. Exactly one of log_dir and metric_logger should be provided. - """ - super(LoggingMetricHook, - self).__init__(tensors=tensors, every_n_iter=every_n_iter, every_n_secs=every_n_secs, at_end=at_end) - - if metric_logger is None: - raise ValueError("metric_logger should be provided.") - self._logger = metric_logger - - def begin(self): - super(LoggingMetricHook, self).begin() - self._global_step_tensor = tf.compat.v1.train.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use LoggingMetricHook.") - if self._global_step_tensor.name not in self._current_tensors: - self._current_tensors[self._global_step_tensor.name] = (self._global_step_tensor) - - def after_run(self, unused_run_context, run_values): - # should_trigger is a internal state that populated at before_run, and it is - # using self_timer to determine whether it should trigger. - if self._should_trigger: - self._log_metric(run_values.results) - - self._iter_count += 1 - - def end(self, session): - if self._log_at_end: - values = session.run(self._current_tensors) - self._log_metric(values) - - def _log_metric(self, tensor_values): - self._timer.update_last_triggered_step(self._iter_count) - global_step = tensor_values[self._global_step_tensor.name] - # self._tag_order is populated during the init of LoggingTensorHook - for tag in self._tag_order: - self._logger.log_metric(tag, tensor_values[tag], global_step=global_step) diff --git a/deepray/utils/logs/metric_hook_test.py b/deepray/utils/logs/metric_hook_test.py deleted file mode 100644 index d8c82c53..00000000 --- a/deepray/utils/logs/metric_hook_test.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for metric_hook.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tempfile -import time - -import tensorflow as tf # pylint: disable=g-bad-import-order -from tensorflow.python.training import monitored_session # pylint: disable=g-bad-import-order - -from official.utils.logs import metric_hook -from official.utils.testing import mock_lib - - -class LoggingMetricHookTest(tf.test.TestCase): - """Tests for LoggingMetricHook.""" - - def setUp(self): - super(LoggingMetricHookTest, self).setUp() - - self._log_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - self._logger = mock_lib.MockBenchmarkLogger() - - def tearDown(self): - super(LoggingMetricHookTest, self).tearDown() - tf.io.gfile.rmtree(self.get_temp_dir()) - - def test_illegal_args(self): - with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=0) - with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=-10) - with self.assertRaisesRegexp(ValueError, "xactly one of"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5, every_n_secs=5) - with self.assertRaisesRegexp(ValueError, "xactly one of"): - metric_hook.LoggingMetricHook(tensors=["t"]) - with self.assertRaisesRegexp(ValueError, "metric_logger"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5) - - def test_print_at_end_only(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - t = tf.constant(42.0, name="foo") - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t.name], at_end=True, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - for _ in range(3): - mon_sess.run(train_op) - self.assertEqual(self._logger.logged_metric, []) - - hook.end(sess) - self.assertEqual(len(self._logger.logged_metric), 1) - metric = self._logger.logged_metric[0] - self.assertRegexpMatches(metric["name"], "foo") - self.assertEqual(metric["value"], 42.0) - self.assertEqual(metric["unit"], None) - self.assertEqual(metric["global_step"], 0) - - def test_global_step_not_found(self): - with tf.Graph().as_default(): - t = tf.constant(42.0, name="foo") - hook = metric_hook.LoggingMetricHook(tensors=[t.name], at_end=True, metric_logger=self._logger) - - with self.assertRaisesRegexp(RuntimeError, "should be created to use LoggingMetricHook."): - hook.begin() - - def test_log_tensors(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - t1 = tf.constant(42.0, name="foo") - t2 = tf.constant(43.0, name="bar") - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t1, t2], at_end=True, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - for _ in range(3): - mon_sess.run(train_op) - self.assertEqual(self._logger.logged_metric, []) - - hook.end(sess) - self.assertEqual(len(self._logger.logged_metric), 2) - metric1 = self._logger.logged_metric[0] - self.assertRegexpMatches(str(metric1["name"]), "foo") - 
self.assertEqual(metric1["value"], 42.0) - self.assertEqual(metric1["unit"], None) - self.assertEqual(metric1["global_step"], 0) - - metric2 = self._logger.logged_metric[1] - self.assertRegexpMatches(str(metric2["name"]), "bar") - self.assertEqual(metric2["value"], 43.0) - self.assertEqual(metric2["unit"], None) - self.assertEqual(metric2["global_step"], 0) - - def _validate_print_every_n_steps(self, sess, at_end): - t = tf.constant(42.0, name="foo") - - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t.name], every_n_iter=10, at_end=at_end, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - for _ in range(3): - self._logger.logged_metric = [] - for _ in range(9): - mon_sess.run(train_op) - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - # Add additional run to verify proper reset when called multiple times. - self._logger.logged_metric = [] - mon_sess.run(train_op) - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - self._logger.logged_metric = [] - hook.end(sess) - if at_end: - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - else: - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - def test_print_every_n_steps(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_steps(sess, at_end=False) - # Verify proper reset. - self._validate_print_every_n_steps(sess, at_end=False) - - def test_print_every_n_steps_and_end(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_steps(sess, at_end=True) - # Verify proper reset. 
- self._validate_print_every_n_steps(sess, at_end=True) - - def _validate_print_every_n_secs(self, sess, at_end): - t = tf.constant(42.0, name="foo") - train_op = tf.constant(3) - - hook = metric_hook.LoggingMetricHook(tensors=[t.name], every_n_secs=1.0, at_end=at_end, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - # assertNotRegexpMatches is not supported by python 3.1 and later - self._logger.logged_metric = [] - mon_sess.run(train_op) - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - time.sleep(1.0) - - self._logger.logged_metric = [] - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - self._logger.logged_metric = [] - hook.end(sess) - if at_end: - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - else: - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - def test_print_every_n_secs(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_secs(sess, at_end=False) - # Verify proper reset. - self._validate_print_every_n_secs(sess, at_end=False) - - def test_print_every_n_secs_and_end(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_secs(sess, at_end=True) - # Verify proper reset. - self._validate_print_every_n_secs(sess, at_end=True) - - -if __name__ == "__main__": - tf.test.main() diff --git a/deepray/utils/logs/mlperf_helper.py b/deepray/utils/logs/mlperf_helper.py index a2340b70..c2553148 100644 --- a/deepray/utils/logs/mlperf_helper.py +++ b/deepray/utils/logs/mlperf_helper.py @@ -193,6 +193,5 @@ def clear_system_caches(): if __name__ == "__main__": - logging.set_verbosity(logging.INFO) with LOGGER(True): ncf_print(key=TAGS.RUN_START) diff --git a/deepray/utils/logs/summary_manager.py b/deepray/utils/logs/summary_manager.py index 7f6ef677..7af94e6c 100644 --- a/deepray/utils/logs/summary_manager.py +++ b/deepray/utils/logs/summary_manager.py @@ -22,8 +22,6 @@ import tensorflow as tf import horovod.tensorflow as hvd -FLAGS = flags.FLAGS - _MIN_SUMMARY_STEPS = 10 @@ -52,7 +50,7 @@ def __init__(self, summary_dir, global_step=None): self.summary_writers['train'], self.summary_writers['evel'] = None, None else: self.summary_writers['evel'] = tf.summary.create_file_writer(os.path.join(self._summary_dir, "eval")) - if FLAGS.steps_per_summary >= _MIN_SUMMARY_STEPS: + if FLAGS.steps_per_execution >= _MIN_SUMMARY_STEPS: # Only writes summary when the stats are collected sufficiently over enough steps. 
self.summary_writers['train'] = tf.summary.create_file_writer(os.path.join(self._summary_dir, "train")) else: diff --git a/deepray/utils/resource_loader.py b/deepray/utils/resource_loader.py index 3e0a6350..e8c9ed61 100644 --- a/deepray/utils/resource_loader.py +++ b/deepray/utils/resource_loader.py @@ -21,7 +21,7 @@ import tensorflow as tf INCLUSIVE_MIN_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.9.1" -EXCLUSIVE_MAX_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.13.0" +EXCLUSIVE_MAX_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.15.0" abi_warning_already_raised = False SKIP_CUSTOM_OPS = False @@ -63,7 +63,7 @@ def ops(self): pytest.skip("Skipping the test because a custom ops " "was being loaded while --skip-custom-ops was set.") if self._ops is None: - self.display_warning_if_incompatible() + # self.display_warning_if_incompatible() self._ops = tf.load_op_library(get_path_to_datafile(self.relative_path, is_so=True)) return self._ops diff --git a/deepray/utils/test_utils.py b/deepray/utils/test_utils.py index 338370a7..81cef91a 100644 --- a/deepray/utils/test_utils.py +++ b/deepray/utils/test_utils.py @@ -14,21 +14,21 @@ # ============================================================================== """Utilities for testing Deepray.""" +import inspect import os import random -import inspect import numpy as np import pytest import tensorflow as tf - from packaging.version import Version + from deepray import options from deepray.utils import resource_loader -if Version(tf.__version__).release >= Version("2.13").release: - # New versions of Keras require importing from `keras.src` when - # importing internal symbols. +if Version(tf.__version__) > Version("2.16.0"): + from tf_keras.src.testing_infra.test_utils import layer_test # noqa: F401 +elif Version(tf.__version__).release >= Version("2.13").release: from keras.src.testing_infra.test_utils import layer_test # noqa: F401 elif Version(tf.__version__) >= Version("2.9"): from keras.testing_infra.test_utils import layer_test # noqa: F401 diff --git a/deepray/utils/timer.py b/deepray/utils/timer.py new file mode 100644 index 00000000..8593299f --- /dev/null +++ b/deepray/utils/timer.py @@ -0,0 +1,34 @@ +import time +from functools import wraps + + +class Timer: + """Useage + if __name__ == "__main__": + with Timer(): + # ... + """ + + def __enter__(self): + self._enter_time = time.time() + + def __exit__(self, *exc_args): + self._exit_time = time.time() + print(f"{self._exit_time - self._enter_time:.2f} seconds elapsed") + + +def timer(func): + """Useage + @timer + def your_function(): + # ... + """ + + @wraps(func) + def inner(*args, **kwargs): + start_time = time.time() + retval = func(*args, **kwargs) + print(f"{time.time() - start_time:.2f} seconds elapsed") + return retval + + return inner diff --git a/deepray/utils/types.py b/deepray/utils/types.py index b92a34d8..01dd101e 100644 --- a/deepray/utils/types.py +++ b/deepray/utils/types.py @@ -19,32 +19,37 @@ import importlib import numpy as np import tensorflow as tf +import tf_keras as keras from packaging.version import Version -# TODO: Remove once https://github.com/tensorflow/tensorflow/issues/44613 is resolved -if Version(tf.__version__).release >= Version("2.13").release: - # New versions of Keras require importing from `keras.src` when - # importing internal symbols. - from keras.src.engine import keras_tensor +# Find KerasTensor. +if Version(tf.__version__).release >= Version("2.16").release: + # Determine if loading keras 2 or 3. 
+ if (hasattr(tf.keras, "version") and Version(tf.keras.version()).release >= Version("3.0").release): + from keras import KerasTensor + else: + from tf_keras.src.engine.keras_tensor import KerasTensor +elif Version(tf.__version__).release >= Version("2.13").release: + from keras.src.engine.keras_tensor import KerasTensor elif Version(tf.__version__).release >= Version("2.5").release: - from keras.engine import keras_tensor + from keras.engine.keras_tensor import KerasTensor else: - from tensorflow.python.keras.engine import keras_tensor + from tensorflow.python.keras.engine.keras_tensor import KerasTensor Number = Union[float, int, np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64,] -Initializer = Union[None, dict, str, Callable, tf.keras.initializers.Initializer] -Regularizer = Union[None, dict, str, Callable, tf.keras.regularizers.Regularizer] -Constraint = Union[None, dict, str, Callable, tf.keras.constraints.Constraint] +Initializer = Union[None, dict, str, Callable, keras.initializers.Initializer] +Regularizer = Union[None, dict, str, Callable, keras.regularizers.Regularizer] +Constraint = Union[None, dict, str, Callable, keras.constraints.Constraint] Activation = Union[None, str, Callable] -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: - Optimizer = Union[tf.keras.optimizers.Optimizer, tf.keras.optimizers.legacy.Optimizer, str] +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: + Optimizer = Union[keras.optimizers.Optimizer, keras.optimizers.legacy.Optimizer, str] else: - Optimizer = Union[tf.keras.optimizers.Optimizer, str] + Optimizer = Union[keras.optimizers.Optimizer, str] TensorLike = Union[List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor, tf.SparseTensor, tf.Variable, - keras_tensor.KerasTensor,] + KerasTensor,] FloatTensorLike = Union[tf.Tensor, float, np.float16, np.float32, np.float64] AcceptableDTypes = Union[tf.DType, np.dtype, type, int, str, None] diff --git a/deepray/version.py b/deepray/version.py index 7c75ae0c..d62f5246 100644 --- a/deepray/version.py +++ b/deepray/version.py @@ -16,12 +16,12 @@ # Required TensorFlow version [min, max) INCLUSIVE_MIN_TF_VERSION = "2.9.1" -EXCLUSIVE_MAX_TF_VERSION = "2.13.0" +EXCLUSIVE_MAX_TF_VERSION = "2.18.0" # We follow Semantic Versioning (https://semver.org/) _MAJOR_VERSION = "0" _MINOR_VERSION = "21" -_PATCH_VERSION = "9" +_PATCH_VERSION = "86" # When building releases, we can update this value on the release branch to # reflect the current release candidate ('rc0', 'rc1') or, finally, the official diff --git a/deepray/workspace0.bzl b/deepray/workspace0.bzl index d954c362..6b991de2 100644 --- a/deepray/workspace0.bzl +++ b/deepray/workspace0.bzl @@ -1,9 +1,9 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" +load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") +load("@rules_compressor//tensorflow:workspace2.bzl", rules_compressor_deps = "tf_workspace2") load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") -load("@rules_compressor//tensorflow:workspace2.bzl", rules_compressor_deps = "tf_workspace2") -load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") def workspace(): # If a target is bound twice, the later one wins, so we have to do tf bindings diff --git a/deepray/workspace2.bzl b/deepray/workspace2.bzl index fbfecb7c..b52c3b6f 100644 --- a/deepray/workspace2.bzl +++ b/deepray/workspace2.bzl @@ -1,8 +1,12 @@ """Deepray workspace initialization. Consult the WORKSPACE on how to use it.""" -# Import external repository rules. -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:repo.bzl", "tf_http_archive") + +# Sanitize a dependency so that it works correctly from code that includes +# TensorFlow as a submodule. +def clean_dep(dep): + return str(Label(dep)) # Define all external repositories required by TensorFlow def _tf_repositories(): @@ -22,12 +26,6 @@ def _tf_repositories(): strip_prefix = "double-conversion-3.2.0", ) - git_repository( - name = "rules_python", - remote = "https://github.com/bazelbuild/rules_python.git", - tag = "0.16.2", - ) - http_archive( name = "eigen3", urls = [ @@ -47,17 +45,18 @@ def _tf_repositories(): type = "tar.gz", strip_prefix = "OpenBLAS-{}".format(OPENBLAS_VERSION), build_file = Label("//third_party:openblas.BUILD"), - # sha256 = "5d9491d07168a5d00116cdc068a40022c3455bf9293c7cb86a65b1054d7e5114", + sha256 = "4c25cb30c4bb23eddca05d7d0a85997b8db6144f5464ba7f8c09ce91e2f35543", ) - ARROW_VERSION = "7.0.0" http_archive( - name = "com_github_apache_arrow", - sha256 = "57e13c62f27b710e1de54fd30faed612aefa22aa41fa2c0c3bacd204dd18a8f3", + name = "org_apache_arrow", build_file = Label("//third_party/arrow:arrow.BUILD"), - strip_prefix = "arrow-apache-arrow-" + ARROW_VERSION, + patches = ["//third_party/arrow:arrow-20.patch"], + patch_args = ["-p1"], + sha256 = "89efbbf852f5a1f79e9c99ab4c217e2eb7f991837c005cba2d4a2fbd35fad212", + strip_prefix = "apache-arrow-20.0.0", urls = [ - "https://github.com/apache/arrow/archive/apache-arrow-{}.tar.gz".format(ARROW_VERSION), + "https://github.com/apache/arrow/releases/download/apache-arrow-20.0.0/apache-arrow-20.0.0.tar.gz", ], ) @@ -94,7 +93,7 @@ def _tf_repositories(): ) http_archive( - name = "com_github_apache_thrift", # Apache License 2.0 + name = "org_apache_thrift", # Apache License 2.0 build_file = Label("//third_party/thrift:thrift.BUILD"), sha256 = "5da60088e60984f4f0801deeea628d193c33cec621e78c8a43a5d8c4055f7ad9", strip_prefix = "thrift-0.13.0", @@ -182,30 +181,6 @@ def _tf_repositories(): ], ) - http_archive( - name = "libcuckoo", - build_file = "//third_party:libcuckoo.BUILD", - patch_args = ["-p1"], - patches = [ - "//third_party:cuckoohash_map.patch", - ], - sha256 = "7238436b7346a0edf4ce57c12f43f71af5347b8b15f9bf2f0e24bfdca6225fc5", - strip_prefix = "libcuckoo-0.3", - urls = [ - "https://github.com/efficient/libcuckoo/archive/v0.3.zip", - ], - ) - - http_archive( - name = "sparsehash", - build_file = "//third_party:sparsehash.BUILD", - sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", - strip_prefix = 
"sparsehash-c11-2.11.1", - urls = [ - "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", - ], - ) - http_archive( name = "murmurhash", build_file = "//third_party:murmurhash.BUILD", @@ -228,23 +203,115 @@ def _tf_repositories(): ) http_archive( + name = "com_github_NVIDIA_cuCollections", + # sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0", + build_file = "//third_party/cuCollections:cuCollections.BUILD", + strip_prefix = "cuCollections-2303a7a2a03e38385dbe1bbc91c55007a94a9192", + urls = [ + "https://github.com/NVIDIA/cuCollections/archive/2303a7a2a03e38385dbe1bbc91c55007a94a9192.zip", + ], + ) + + tf_http_archive( name = "cuCollections", # Apache License 2.0 - # patches = ["//third_party/cucollection:cucollection.patch"], - build_file = "//third_party/cucollection:cuco.BUILD", + patch_file = [clean_dep("//third_party/cuCollections:cucollection.patch")], + build_file = clean_dep("//third_party/cuCollections:cuco.BUILD"), sha256 = "c5c77a1f96b439b67280e86483ce8d5994aa4d14b7627b1d3bd7880be6be23fa", strip_prefix = "cuCollections-193de1aa74f5721717f991ca757dc610c852bb17", urls = [ "https://github.com/NVIDIA/cuCollections/archive/193de1aa74f5721717f991ca757dc610c852bb17.zip", + "https://github.com/NVIDIA/cuCollections/archive/193de1aa74f5721717f991ca757dc610c852bb17.zip", ], ) - http_archive( - name = "sparsehash_c11", - build_file = "//third_party:sparsehash_c11.BUILD", + tf_http_archive( + name = "sparsehash_c11", # BSD-3-Clause License + build_file = clean_dep("//third_party/sparsehash_c11:sparsehash_c11.BUILD"), + patch_file = [ + clean_dep("//third_party/sparsehash_c11:sparsehash_c11.patch"), + ], sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", strip_prefix = "sparsehash-c11-2.11.1", urls = [ "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + ], + ) + + # http_archive( + # name = "sparsehash_c11", # BSD-3-Clause License + # build_file = "//third_party/sparsehash_c11:sparsehash_c11.BUILD", + # patch_args = ["-p1"], + # patches = ["//third_party/sparsehash_c11:sparsehash_c11.patch"], + # sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", + # strip_prefix = "sparsehash-c11-2.11.1", + # urls = [ + # "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + # # "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + # ], + # ) + + http_archive( + name = "cutlass", + urls = ["https://github.com/NVIDIA/cutlass/archive/319a389f42b776fae5701afcb943fc03be5b5c25.zip"], + build_file = "//third_party:cutlass.BUILD", + strip_prefix = "cutlass-319a389f42b776fae5701afcb943fc03be5b5c25", + ) + + http_archive( + name = "flash_attn", + urls = ["https://github.com/Dao-AILab/flash-attention/archive/9818f85fee29ac6b60c9214bce841f8109a18b1b.zip"], # v1.0.4 + build_file = "//third_party/flash_attn:flash_attn.BUILD", + sha256 = "15f29a1095600ba2a3af688fa96a0a48635edb90fffec56c6eb7c48a4a322d2b", + strip_prefix = "flash-attention-9818f85fee29ac6b60c9214bce841f8109a18b1b", + patches = [ + "//third_party/flash_attn:flash_attn.patch", + ], + patch_args = ["-p1"], + ) + + http_archive( + name = "libcuckoo", + build_file = "//third_party:libcuckoo.BUILD", + patch_args = ["-p1"], + patches = [ + "//third_party:cuckoohash_map.patch", + ], + sha256 = "7238436b7346a0edf4ce57c12f43f71af5347b8b15f9bf2f0e24bfdca6225fc5", + strip_prefix = "libcuckoo-0.3", + urls = [ + 
"https://github.com/efficient/libcuckoo/archive/v0.3.zip", + ], + ) + + http_archive( + name = "com_github_google_leveldb", + sha256 = "f99dc5dcb6f23e500b197db02e993ee0d3bafd1ac84b85ab50de9009b36fbf03", + strip_prefix = "leveldb-5d94ad4d95c09d3ac203ddaf9922e55e730706a8", + build_file = "//third_party:leveldb.BUILD", + urls = [ + "https://github.com/google/leveldb/archive/5d94ad4d95c09d3ac203ddaf9922e55e730706a8.tar.gz", + ], + ) + + tf_http_archive( + name = "readerwriterqueue_archive", + build_file = clean_dep("//third_party:readerwriterqueue.BUILD"), + sha256 = "fc68f55bbd49a8b646462695e1777fb8f2c0b4f342d5e6574135211312ba56c1", + strip_prefix = "readerwriterqueue-1.0.6", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + "https://github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + ], + ) + + http_archive( + name = "openssl", + sha256 = "9f54d42aed56f62889e8384895c968e24d57eae701012776d5f18fb9f2ae48b0", + build_file = "//third_party:openssl.BUILD", + strip_prefix = "openssl-openssl-3.0.2", + urls = [ + "https://github.com/openssl/openssl/archive/refs/tags/openssl-3.0.2.tar.gz", ], ) diff --git a/deepray/workspace3.bzl b/deepray/workspace3.bzl index 2aaed1e7..8ff78d85 100644 --- a/deepray/workspace3.bzl +++ b/deepray/workspace3.bzl @@ -1,16 +1,8 @@ """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") def workspace(): - http_archive( - name = "rules_cc", - urls = ["https://github.com/bazelbuild/rules_cc/releases/download/0.0.9/rules_cc-0.0.9.tar.gz"], - sha256 = "2037875b9a4456dce4a79d112a8ae885bbc4aad968e6587dca6e64f3a0900cdf", - strip_prefix = "rules_cc-0.0.9", - ) - http_archive( name = "rules_foreign_cc", sha256 = "476303bd0f1b04cc311fc258f1708a5f6ef82d3091e53fd1977fa20383425a6a", @@ -27,12 +19,10 @@ def workspace(): sha256 = "8f9ee2dc10c1ae514ee599a8b42ed99fa262b757058f65ad3c384289ff70c4b8", ) - git_repository( + http_archive( name = "rules_compressor", - # branch = "main", - remote = "https://github.com/fuhailin/rules_compressor.git", - commit = "a98ee1d04dc8175aa87a9640caef25725a78ef03", - shallow_since = "1681204047 +0800", + url = "https://github.com/fuhailin/rules_compressor/archive/refs/heads/main.zip", + strip_prefix = "rules_compressor-main", ) # Alias so it can be loaded without assigning to a different symbol to prevent diff --git a/docker.sh b/docker.sh index 1d8dcc9b..2c2f6bb8 100644 --- a/docker.sh +++ b/docker.sh @@ -2,20 +2,26 @@ set -x -e -PY_VERSION=${1:-"3.8"} -TF_VERSION=${2:-"2.9.1"} -CUDA_VERSION=${3:-"11.6.2"} -OS_VERSION=${3:-"20.04"} +PY_VERSION=${1:-"3.10"} +TF_VERSION=${2:-"2.15.0"} +CUDA_VERSION=${3:-"12.2.2"} +OS_VERSION=${3:-"22.04"} -docker pull hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} +# docker pull hailinfufu/deepray-release:nightly-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} + +# docker volume create -d local --name dev-build \ +# --opt device="/data/fuhailin/workspaces" \ +# --opt type="none" \ +# --opt o="bind" docker run --gpus all -it \ - --rm=true \ - --name="deepray_dev" \ - -w /workspaces \ + --rm \ + --network=host \ + --name="deepray_dev_py${PY_VERSION}" \ + --volume=/data/fuhailin/workspaces/datasets/:/datasets \ --volume=dev-build:/workspaces \ - --shm-size=1g \ - --device /dev/fuse \ - 
--network host \ --privileged \ - hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} /bin/bash + --cap-add=SYS_PTRACE \ + --shm-size=1g \ + --ulimit memlock=-1 \ + hailinfufu/deepray-release:nightly-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} diff --git a/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh b/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh index 50110616..2caa469a 100644 --- a/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh +++ b/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh @@ -14,66 +14,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -keras_use_ctl=${1:-"true"} -num_gpu=${2:-"4"} +set -eu +set -o pipefail batch_size=${3:-"1024"} learning_rate=${4:-"5e-6"} -precision=${5:-"fp32"} -use_xla=${6:-"true"} -epochs=${7:-"100"} -model=${8:-"demo"} - - -if [ $num_gpu -gt 1 ] ; then - mpi_command="mpirun -np $num_gpu \ - --allow-run-as-root -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib" - use_hvd="--use_horovod" -else - mpi_command="" - use_hvd="" -fi -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16="--dtype=fp16" -else - use_fp16="" -fi - -if [ "$use_xla" = "true" ] ; then - use_xla_tag="--enable_xla" - echo "XLA activated" -else - use_xla_tag="" -fi - - -export GBS=$(expr $batch_size \* $num_gpu) -printf -v TAG "tf_training_fashion_mnist_%s_%s_gbs%d" "$model" "$precision" $GBS -DATESTAMP=`date +'%y%m%d%H%M%S'` +printf -v TAG "tf_training_fashion_mnist_gbs%d" $batch_size +DATESTAMP=$(date +'%y%m%d%H%M%S') #Edit to save logs & checkpoints in a different directory -RESULTS_DIR=/results/${TAG}_${DATESTAMP} +RESULTS_DIR=/workspaces/results/${TAG}_${DATESTAMP} LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log mkdir -m 777 -p $RESULTS_DIR printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$mpi_command python -m examples.CV.Classify_images_of_clothing.train \ - --train_data=fashion_mnist \ - --keras_use_ctl=$keras_use_ctl \ - --num_gpus=$num_gpu \ - --batch_size=$batch_size \ - --learning_rate=$learning_rate \ - --epochs=$epochs \ - --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE - +CUDA_VISIBLE_DEVICES=0 python train.py \ + --use_custom_training_loop=True \ + --run_eagerly=False \ + --train_data=fashion_mnist \ + --batch_size=$batch_size \ + --learning_rate=$learning_rate \ + --epochs=3 \ + --model_dir=${RESULTS_DIR} |& tee $LOGFILE set +x diff --git a/modelzoo/CV/Classify_images_of_clothing/train.py b/modelzoo/CV/Classify_images_of_clothing/train.py index c81ee943..590a3f5d 100644 --- a/modelzoo/CV/Classify_images_of_clothing/train.py +++ b/modelzoo/CV/Classify_images_of_clothing/train.py @@ -1,39 +1,37 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf -from absl import app, flags - -from deepray.core.base_trainer import Trainer -from deepray.core.common import distribution_utils +from absl import flags +import datetime, os +import deepray as dp +from deepray.core.trainer import Trainer from deepray.datasets.fashion_mnist import FashionMNIST -FLAGS = flags.FLAGS - -def main(_): - _strategy = distribution_utils.get_distribution_strategy() - 
data_pipe = FashionMNIST() - with distribution_utils.get_strategy_scope(_strategy): - model = tf.keras.Sequential( - [ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ] - ) +def main(): + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation='softmax') + ] + ) trainer = Trainer( model=model, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer='adam', + loss='sparse_categorical_crossentropy', metrics=['accuracy'], ) - train_input_fn = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit(train_input=train_input_fn,) + data_pipe = FashionMNIST() + train_input_fn = data_pipe(flags.FLAGS.batch_size, is_training=True) + + # logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) + logdir = os.path.join(flags.FLAGS.model_dir, 'tensorboard') + + tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1) + trainer.fit(train_input=train_input_fn, callbacks=[tensorboard_callback]) if __name__ == "__main__": - flags.mark_flag_as_required("model_dir") - app.run(main) + dp.runner(main) diff --git a/modelzoo/CV/GAN/train.py b/modelzoo/CV/GAN/train.py index c9b1c2ae..c56339f7 100644 --- a/modelzoo/CV/GAN/train.py +++ b/modelzoo/CV/GAN/train.py @@ -8,18 +8,17 @@ from absl import app, flags from datetime import datetime -from deepray.core.base_trainer import Trainer +from deepray.core.trainer import Trainer from deepray.core.common import distribution_utils from deepray.datasets.mnist import Mnist -FLAGS = flags.FLAGS FLAGS( [ sys.argv[0], "--train_data=mnist", # "--distribution_strategy=off", # "--run_eagerly=true", - "--steps_per_summary=10", + "--steps_per_execution=10", # "--use_horovod=True", # "--batch_size=1024", ] diff --git a/modelzoo/CV/SwinTransformers/train.py b/modelzoo/CV/SwinTransformers/train.py index 772cb43d..246dfb83 100644 --- a/modelzoo/CV/SwinTransformers/train.py +++ b/modelzoo/CV/SwinTransformers/train.py @@ -6,13 +6,11 @@ from absl import app, flags from tensorflow import keras -from deepray.core.base_trainer import Trainer +from deepray.core.trainer import Trainer from deepray.core.common import distribution_utils from deepray.datasets.cifar import CIFAR100 from .model import BaseModel -FLAGS = flags.FLAGS - learning_rate = 1e-3 batch_size = 128 num_epochs = 40 diff --git a/modelzoo/CV/mnist/run_early.sh b/modelzoo/CV/mnist/run_early.sh index 3bfb479e..15583d12 100644 --- a/modelzoo/CV/mnist/run_early.sh +++ b/modelzoo/CV/mnist/run_early.sh @@ -26,14 +26,6 @@ use_xla=${6:-"true"} epochs=${7:-"10"} model=${8:-"demo"} -if [ $num_gpu -gt 1 ]; then - hvd_command="horovodrun -np $num_gpu " - use_hvd="--use_horovod" -else - hvd_command="" - use_hvd="--distribution_strategy=off" -fi - if [ "$precision" = "fp16" ]; then echo "fp16 activated!" 
use_fp16="--dtype=fp16" @@ -60,15 +52,14 @@ printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$hvd_command python train_earlystop.py \ +python train_earlystop.py \ --train_data=mnist \ --keras_use_ctl=$keras_use_ctl \ --num_gpus=$num_gpu \ --batch_size=$batch_size \ --learning_rate=$learning_rate \ - --steps_per_summary=20 \ + --steps_per_execution=20 \ --epochs=$epochs \ --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE - + $use_fp16 $use_xla_tag |& tee $LOGFILE set +x diff --git a/modelzoo/CV/mnist/run_horovod.sh b/modelzoo/CV/mnist/run_horovod.sh index a552bfb2..5ae789ae 100644 --- a/modelzoo/CV/mnist/run_horovod.sh +++ b/modelzoo/CV/mnist/run_horovod.sh @@ -15,31 +15,13 @@ # limitations under the License. # ============================================================================== -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -keras_use_ctl=${1:-"true"} -num_gpu=${2:-"1"} -batch_size=${3:-"128"} -learning_rate=${4:-"5e-6"} -precision=${5:-"fp32"} -use_xla=${6:-"true"} -epochs=${7:-"1"} -model=${8:-"demo"} - -if [ $num_gpu -gt 1 ]; then - mpi_command="mpirun -np $num_gpu \ - --allow-run-as-root -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib" - use_hvd="--use_horovod" -else - mpi_command="" - use_hvd="" -fi +batch_size=${1:-"128"} +learning_rate=${2:-"5e-6"} +precision=${3:-"fp32"} +use_xla=${4:-"False"} +epochs=${5:-"1"} if [ "$precision" = "fp16" ]; then - echo "fp16 activated!" use_fp16="--dtype=fp16" else use_fp16="" @@ -47,13 +29,12 @@ fi if [ "$use_xla" = "true" ]; then use_xla_tag="--enable_xla" - echo "XLA activated" else use_xla_tag="" fi -export GBS=$(expr $batch_size \* $num_gpu) -printf -v TAG "tf_training_mnist_%s_%s_gbs%d" "$model" "$precision" $GBS +export GBS=$(expr $batch_size) +printf -v TAG "tf_training_mnist_gbs%d" $GBS DATESTAMP=$(date +'%y%m%d%H%M%S') #Edit to save logs & checkpoints in a different directory @@ -64,16 +45,15 @@ printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$mpi_command python train.py \ - --train_data=mnist \ - --keras_use_ctl=$keras_use_ctl \ - --num_gpus=$num_gpu \ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --run_eagerly=False \ --batch_size=$batch_size \ --learning_rate=$learning_rate \ - --steps_per_summary=1 \ - --stop_steps=20 \ + --steps_per_execution=10 \ + --stop_steps=-1 \ --epochs=$epochs \ --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE + $use_fp16 $use_xla_tag +# |& tee $LOGFILE set +x diff --git a/modelzoo/CV/mnist/train.py b/modelzoo/CV/mnist/train.py index 9665c2ef..dbd8be68 100644 --- a/modelzoo/CV/mnist/train.py +++ b/modelzoo/CV/mnist/train.py @@ -5,44 +5,93 @@ import os import sys +import keras +import numpy as np import tensorflow as tf -from absl import app, flags +from absl import flags -from deepray.core.base_trainer import Trainer -from deepray.core.common import distribution_utils +import deepray as dp +from deepray.core.trainer import Trainer from deepray.datasets.mnist import Mnist -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--train_data=mnist", - # "--distribution_strategy=off", - # "--run_eagerly=true", - "--steps_per_summary=10", - # "--use_horovod=True", - # "--batch_size=1024", - ] -) - - -def main(_): - _strategy = distribution_utils.get_distribution_strategy() - data_pipe = Mnist() - with distribution_utils.get_strategy_scope(_strategy): - mnist_model = 
tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) + +def define_flasg(): + flags.FLAGS( + [ + sys.argv[0], + "--train_data=mnist", + # "--run_eagerly=true", + "--steps_per_execution=1", + # "--batch_size=1024", + ] + ) + + +class EarlyStoppingAtMinLoss(keras.callbacks.Callback): + """Stop training when the loss is at its min, i.e. the loss stops decreasing. + + Arguments: + patience: Number of epochs to wait after min has been hit. After this + number of no improvement, training stops. + """ + + def __init__(self, patience=0): + super().__init__() + self.patience = patience + # best_weights to store the weights at which the minimum loss occurs. + self.best_weights = None + + def on_train_begin(self, logs=None): + # The number of epoch it has waited when loss is no longer minimum. + self.wait = 0 + # The epoch the training stops at. + self.stopped_epoch = 0 + # Initialize the best as infinity. + self.best = np.Inf + + # def on_batch_begin(self, batch, logs=None): + # pass + + # def on_batch_end(self, batch, logs=None): + # if batch < 5: + # print(batch, self.model.get_weights()[0][0][0][0]) + # pass + + def on_epoch_end(self, epoch, logs=None): + print(logs) + current = logs.get("loss") + if np.less(current, self.best): + self.best = current + self.wait = 0 + # Record the best weights if current results is better (less). + self.best_weights = self.model.get_weights() + else: + self.wait += 1 + if self.wait >= self.patience: + self.stopped_epoch = epoch + self.model.stop_training = True + print("Restoring model weights from the end of the best epoch.") + self.model.set_weights(self.best_weights) + + def on_train_end(self, logs=None): + if self.stopped_epoch > 0: + print("Epoch %05d: early stopping" % (self.stopped_epoch + 1)) + + +def main(): + mnist_model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) trainer = Trainer( optimizer=tf.keras.optimizers.Adam(0.001), @@ -51,14 +100,19 @@ def main(_): # loss='sparse_categorical_crossentropy', metrics=["accuracy"] ) - + data_pipe = Mnist() + train_input = data_pipe(flags.FLAGS.batch_size, is_training=True) + test_input = data_pipe(flags.FLAGS.batch_size, is_training=False) tboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=os.path.join(FLAGS.model_dir, 'tensorboard'), histogram_freq=1, profile_batch='10,20' + log_dir=os.path.join(flags.FLAGS.model_dir, 'tensorboard'), histogram_freq=1, profile_batch='1,2' ) - train_input = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit(train_input=train_input, callbacks=[tboard_callback]) + trainer.fit( + train_input=train_input, + eval_input=test_input, + callbacks=[tboard_callback, EarlyStoppingAtMinLoss()], + ) if __name__ == "__main__": - app.run(main) + dp.runner(main) diff --git 
a/modelzoo/CV/mnist/train_earlystop.py b/modelzoo/CV/mnist/train_earlystop.py deleted file mode 100644 index 6c687991..00000000 --- a/modelzoo/CV/mnist/train_earlystop.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import keras -import numpy as np -import tensorflow as tf -from absl import app, flags - -from deepray.core.base_trainer import Trainer -from deepray.datasets.mnist import Mnist - -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--train_data=mnist", - # "--distribution_strategy=off", - # "--run_eagerly=true", - "--steps_per_summary=10", - # "--use_horovod=True", - # "--batch_size=1024", - ] -) - - -def get_model(): - return tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(1), - ] - ) - - -class EarlyStoppingAtMinLoss(keras.callbacks.Callback): - """Stop training when the loss is at its min, i.e. the loss stops decreasing. - - Arguments: - patience: Number of epochs to wait after min has been hit. After this - number of no improvement, training stops. - """ - - def __init__(self, patience=0): - super().__init__() - self.patience = patience - # best_weights to store the weights at which the minimum loss occurs. - self.best_weights = None - - def on_train_begin(self, logs=None): - # The number of epoch it has waited when loss is no longer minimum. - self.wait = 0 - # The epoch the training stops at. - self.stopped_epoch = 0 - # Initialize the best as infinity. - self.best = np.Inf - - # def on_batch_begin(self, batch, logs=None): - # pass - - # def on_batch_end(self, batch, logs=None): - # if batch < 5: - # print(batch, self.model.get_weights()[0][0][0][0]) - # pass - - def on_epoch_end(self, epoch, logs=None): - print(logs) - current = logs.get("loss") - if np.less(current, self.best): - self.best = current - self.wait = 0 - # Record the best weights if current results is better (less). 
- self.best_weights = self.model.get_weights() - else: - self.wait += 1 - if self.wait >= self.patience: - self.stopped_epoch = epoch - self.model.stop_training = True - print("Restoring model weights from the end of the best epoch.") - self.model.set_weights(self.best_weights) - - def on_train_end(self, logs=None): - if self.stopped_epoch > 0: - print("Epoch %05d: early stopping" % (self.stopped_epoch + 1)) - - -def main(_): - data_pipe = Mnist() - model = get_model() - - trainer = Trainer( - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.1), - model=model, - loss="mean_squared_error", - metrics=["mean_absolute_error"], - ) - - callbacks = [EarlyStoppingAtMinLoss()], - - train_input = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit( - train_input=train_input, - callbacks=callbacks, - ) - - -if __name__ == "__main__": - app.run(main) diff --git a/modelzoo/ELECTRA/.gitignore b/modelzoo/ELECTRA/.gitignore new file mode 100644 index 00000000..7a43e90b --- /dev/null +++ b/modelzoo/ELECTRA/.gitignore @@ -0,0 +1,129 @@ +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +#Data checkpoints and results +data/*/*/ +data/*/*.zip +checkpoints/ +results/* + +#Editor +.idea +.idea/* + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# vscode +.vscode diff --git a/modelzoo/ELECTRA/Dockerfile b/modelzoo/ELECTRA/Dockerfile new file mode 100644 index 00000000..88decd29 --- /dev/null +++ b/modelzoo/ELECTRA/Dockerfile @@ -0,0 +1,31 @@ +# syntax = docker/dockerfile:1 +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.07-tf2-py3 +FROM ${FROM_IMAGE_NAME} +RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract + +ENV DATA_PREP_WORKING_DIR /workspace/electra/data +WORKDIR /workspace +RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. +RUN git clone https://github.com/soskek/bookcorpus.git + +WORKDIR /workspace/electra + +RUN pip install --no-cache-dir tqdm boto3 requests six ipdb h5py nltk progressbar filelock \ + git+https://github.com/NVIDIA/dllogger \ + nvidia-ml-py3==7.352.0 tokenizers==0.11.0 + +RUN apt-get install -y iputils-ping +COPY . . diff --git a/modelzoo/ELECTRA/LICENSE b/modelzoo/ELECTRA/LICENSE new file mode 100644 index 00000000..6b0b1270 --- /dev/null +++ b/modelzoo/ELECTRA/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/modelzoo/ELECTRA/NOTICE b/modelzoo/ELECTRA/NOTICE new file mode 100644 index 00000000..453fd085 --- /dev/null +++ b/modelzoo/ELECTRA/NOTICE @@ -0,0 +1,5 @@ +ELECTRA Tensorflow 2 + +This repository includes software from https://github.com/huggingface/transformers +licensed under the Apache License 2.0. + diff --git a/modelzoo/ELECTRA/README.md b/modelzoo/ELECTRA/README.md new file mode 100644 index 00000000..154a7dff --- /dev/null +++ b/modelzoo/ELECTRA/README.md @@ -0,0 +1,1005 @@ +# ELECTRA For TensorFlow2 + +This repository provides a script and recipe to train the ELECTRA model for TensorFlow2 to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA. 
+ + +## Table Of Contents +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Parameters](#parameters) + + [Pre-training parameters](#pre-training-parameters) + + [Fine-tuning parameters](#fine-tuning-parameters) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + + [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + + [Pre-training](#pre-training) + + [Multi-node](#multi-node) + + [Fine-tuning](#fine-tuning) + * [Inference process](#inference-process) + + [Fine-tuning inference](#fine-tuning-inference) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + + [Training performance benchmark](#training-performance-benchmark) + + [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + + [Training accuracy results](#training-accuracy-results) + - [Pre-training loss curves](#pre-training-loss-curves) + - [Pre-training loss results](#pre-training-loss-results) + - [Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-nvidia-dgx-a100-8x-a100-40gb) + - [Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-accuracy-nvidia-dgx-1-8x-v100-16gb) + - [Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB)](#fine-tuning-accuracy-nvidia-dgx-2-16x-v100-32gb) + - [Training stability test](#training-stability-test) + * [Pre-training stability test: NVIDIA DGX A100 (8x A100 40GB)](#pre-training-stability-test-nvidia-dgx-a100-8x-a100-40gb) + * [Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-stability-test-nvidia-dgx-1-8x-v100-16gb) + + [Training performance results](#training-performance-results) + - [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb) + * [Pre-training NVIDIA DGX A100 (8x A100 40GB)](#pre-training-nvidia-dgx-a100-8x-a100-40gb) + * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb) + - [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) + * [Pre-training NVIDIA DGX-1 (8x V100 16GB)](#pre-training-nvidia-dgx-1-8x-v100-16gb) + * [Fine-tuning NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-nvidia-dgx-1-8x-v100-16gb) + - [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb) + * [Pre-training NVIDIA DGX-2 (16x V100 32GB)](#pre-training-nvidia-dgx-2-16x-v100-32gb) + * [Fine-tuning NVIDIA DGX-2 (16x V100 32GB)](#fine-tuning-nvidia-dgx-2-16x-v100-32gb) + + [Inference performance results](#inference-performance-results) + - [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb) + * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb) + - [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4) + * [Fine-tuning inference on NVIDIA T4](#fine-tuning-inference-on-nvidia-t4) +- [Release notes](#release-notes) + * 
[Changelog](#changelog) + * [Known issues](#known-issues) + +## Model overview + +Electra (Efficiently Learning an Encoder that Classifies Token Replacements Accurately), is a novel pre-training method for language representations which outperforms existing techniques, given the same compute budget on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) paper. NVIDIA's implementation of ELECTRA is an optimized version of the [Hugging Face implementation](https://huggingface.co/transformers/model_doc/electra.html), leveraging mixed precision arithmetic and Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures for faster training times with state-of-the-art accuracy. + +This repository contains the scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for pre-training on your own dataset (Wikipedia and BookCorpus shown as an example), and fine-tuning for tasks such as question answering. The major differences between the original implementation as described in the paper and this version of ELECTRA are as follows: + +- Scripts to download Wikipedia and BookCorpus datasets +- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion +- Automatic mixed precision (AMP) support and optimized for performance +- Multi-GPU and Multi-node training support with push-button scripts to reach state-of-the-art accuracy and performance. + +Other publicly available implementations of Electra include: +1. [Hugging Face](https://huggingface.co/transformers/model_doc/electra.html) +2. [Google's implementation](https://github.com/google-research/electra) + +This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Additionally, this model provides push-button solutions to pre-training, fine-tuning and inference and on a corpus of choice. As a result, researchers can get results up to 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +ELECTRA is a combination of two Transformer models: a generator and a discriminator. The generator’s role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator, which is the model we are interested in, tries to identify which tokens were replaced by the generator in the sequence. Both generator and discriminator use the same architecture as the encoder of the Transformer. The encoder is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer performs self-attention on multiple input representations. + +![Figure 1-1](https://1.bp.blogspot.com/-sHybc03nJRo/XmfLongdVYI/AAAAAAAAFbI/a0t5w_zOZ-UtxYaoQlVkmTRsyFJyFddtQCLcBGAsYHQ/s1600/image1.png "ELECTRA architecture") + + + +### Default configuration + +ELECTRA uses a new pre-training task called replaced token detection (RTD), that trains a bidirectional model (like a MLM) while learning from all input positions (like a LM). 
Inspired by generative adversarial networks (GANs), instead of corrupting the input by replacing tokens with “[MASK]” as in BERT, the generator is trained to corrupt the input by replacing some input tokens with incorrect, but somewhat plausible, fakes. On the other hand, the discriminator is trained to distinguish between “real” and “fake” input data. + +The [Google ELECTRA repository](https://github.com/google-research/electra) reports the results for three configurations of ELECTRA, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below. + +| **Model** | **Hidden layers** | **Hidden unit size** | **Parameters** | +|:---------:|:----------:|:---:|:----:| +|ELECTRA_SMALL|12 encoder| 256 | 14M| +|ELECTRA_BASE |12 encoder| 768 |110M| +|ELECTRA_LARGE|24 encoder|1024 |335M| + +The following features were implemented in this model: +- General: + - Mixed precision support with TensorFlow Automatic Mixed Precision (TF-AMP) + - Multi-GPU support using Horovod + - XLA support + - Multi-Node support + + +- Training + - Pre-training support + - Fine-tuning example + + +- Inference: + - Joint predictions with beam search. + +### Feature support matrix + +The following features are supported by this model. + +| **Feature** | **ELECTRA** | +|:---------:|:----------:| +|LAMB|Yes| +|Automatic mixed precision (AMP)|Yes| +|XLA|Yes| +|Horovod Multi-GPU|Yes| +|Multi-node|Yes| + +#### Features + +**Automatic Mixed Precision (AMP)** + +This implementation of ELECTRA uses AMP to implement mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just a few lines of code. + +**Horovod** + +Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod). + +Multi-GPU training with Horovod + +Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage). + +**XLA support (experimental)** + +XLA is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage: most internal benchmarks run ~1.1-1.5x faster after XLA is enabled. +[AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training. + +**Multi-node Training** + +Supported on a Pyxis/Enroot Slurm cluster. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps: + +1. 
Porting the model to use the FP16 data type where appropriate. +2. Adding loss scaling to preserve small gradient values. + +This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally. + +In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling. + +For information about: +- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation. +- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog. +- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide. + + +#### Enabling mixed precision + +This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--amp` flag to the `run_pretraining.py` or `run_tf_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: + +1. Set the Keras mixed precision policy: + ```python + if config.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + ``` + +2. Use the loss scaling wrapper on the optimizer: + ```python + if config.amp: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") + ``` + +3. Use scaled loss to calculate the gradients: + ```python + #Scale loss + if config.amp: + total_loss = optimizer.get_scaled_loss(total_loss) + gradients = tape.gradient(total_loss, model.trainable_variables) + #Get unscaled gradients if AMP + if config.amp: + gradients = optimizer.get_unscaled_gradients(gradients) + ``` + +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. 
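When validating numerics, it can be helpful to compare against a run with TF32 turned off. The snippet below is a minimal sketch using TensorFlow's public toggle (available in TensorFlow 2.4 and later); it is not wired into the scripts in this repository and should be called before the model executes:

```python
import tensorflow as tf

# Minimal sketch: disable TF32 to obtain a pure-FP32 baseline for numerical comparison.
tf.config.experimental.enable_tensor_float_32_execution(False)
print("TF32 enabled:", tf.config.experimental.tensor_float_32_execution_enabled())

# Restore the default (TF32 on) for Ampere GPUs once the comparison is done.
tf.config.experimental.enable_tensor_float_32_execution(True)
```

This toggle only affects float32 matrix multiplications and convolutions on Ampere-class GPUs; FP16/AMP execution paths are unchanged.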
+ +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + +### Glossary + +**Fine-tuning** +Training an already pretrained model further using a task specific dataset for subject-specific refinements, by adding task-specific layers on top if required. + +**Language Model** +Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence. + +**Pre-training** +Training a model on vast amounts of data on the same (or different) task to build general understandings. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + **Phase 1** +Pretraining on samples of sequence length 128 and at most 15% masked predictions per sequence. + +**Phase 2** +Pretraining on samples of sequence length 512 and at most 15% masked predictions per sequence. + +## Setup + +The following section lists the requirements that you need to meet in order to start training the ELECTRA model. + +### Requirements + +This repository contains Dockerfile which extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: + +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [TensorFlow2 20.07-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-tensorflow) +- Supported GPUs: + - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) + - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/) + - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry) +- [Running TensorFlow2](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running) + +For those unable to use the TensorFlow 2 NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html). + +For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster. + +More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide). + +## Quick Start Guide + +To train your model using mixed precision or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the ELECTRA model. The default parameters for pre-training have been set to run on both 8x A100 40G and 8 x V100 32G GPUs. 
For the specifics concerning training and inference, see the [Advanced](#advanced) section. + +1. Clone the repository. + +``` +git clone https://github.com/NVIDIA/DeepLearningExamples.git +cd DeepLearningExamples/TensorFlow2/LanguageModeling/ELECTRA +``` + +2. Build ELECTRA on top of the NGC container. +``` +bash scripts/docker/build.sh +``` + +3. Start an interactive session in the NGC container to run data download, training and inference. +``` +bash scripts/docker/launch.sh +``` + +Resultant logs of pre-training and fine-tuning routines are stored in the `results/` folder. Checkpoints are stored in the `results//` folder. + +Required data is downloaded into the `data/` directory by default. + +4. Download and preprocess the dataset. + +This repository provides scripts to download, verify, and extract the following datasets: + +- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering) +- Wikipedia (pre-training) +- BookCorpus (pre-training) + +To download, verify, extract the datasets, and create the shards in `tfrecord` format, run: +``` +/workspace/electra/data/create_datasets_from_start.sh +``` + +Note: For fine-tuning only, Wikipedia and Bookscorpus dataset download and preprocessing can be skipped by commenting it out. + +- Download Wikipedia only for pretraining + +The pre-training dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server most of the time gets overloaded and also contains broken links resulting in HTTP 403 and 503 errors. Hence, it is recommended to skip downloading BookCorpus data by running: + +``` +/workspace/electra/data/create_datasets_from_start.sh wiki_only +``` + +- Download Wikipedia and BookCorpus + +Users are welcome to download the BookCorpus from other sources to match our accuracy, or repeatedly try our script until the required number of files are downloaded by running the following: +``` +/workspace/electra/data/create_datasets_from_start.sh wiki_books +``` + +Note: Not using the BookCorpus can potentially change the final accuracy on a few downstream tasks. + +5. Start pretraining. + +To run on a single node 8 x V100 32G, from within the container, you can use the following script to run pre-training. +``` +bash scripts/run_pretraining.sh +``` + +The default hyperparameters are set to run on both 8 x A100 40G and 8 x V100 32G. + +For the other platforms, the configs present in `scripts/configs/pretrain_config.sh` can be used as shown below: +``` +bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp) +``` + +To run pre-training on multiple nodes, see the [Multi-node](#multi-node) section. + +6. Postprocess pretrained checkpoint and fine-tune on SQuAD dataset + +The above pretrained ELECTRA model representations can be fine-tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script extracts and saves the discriminator and generator from the pretrained checkpoint and fine-tunes the discriminator on SQuAD: + +``` +checkpoints=results/base/checkpoints bash scripts/finetune_ckpts_on_squad.sh +``` + +It internally runs `postprocess_pretrained_ckpt.py` which extracts and saves the discriminator and the generator from the pretrained checkpoint. + +The default hyperparameters are set to run on 8 x V100 16G. + +To run fine-tuning with the SQuAD dataset on Google's pretrained checkpoints, do the following. 
+``` +bash scripts/run_squad.sh +``` + +For other platforms, configs present in `scripts/configs/squad_config.sh` can be used as shown below: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) train_eval +``` + +7. Start validation/evaluation. + +Validation can be performed by running: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) eval +``` +Running training first is required to generate needed checkpoints. + +8. Start inference/predictions. + +Inference can be performed by running: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) prediction +``` +Inference predictions are saved to `/predictions.json`. + +## Advanced + +The following sections provide greater details of the datasets, running training and inference, and the training results. + +### Scripts and sample code + +Descriptions of the key scripts and folders are provided below. + +- `data/` - Contains scripts for downloading and preparing individual datasets, and will contain downloaded and processed datasets. +- `scripts/` - Contains shell scripts to launch the Docker container, data download, pre-training, fine-tuning and inference. +- `results/` - Folder where all training and inference results get stored by default. +- `run_squad.sh` - Interface for launching question answering fine-tuning with `run_tf_squad.py`. +- `run_pretraining.sh` - Interface for launching ELECTRA pre-training with `run_pretraining.py`. +- `finetune_ckpts_on_squad.sh` - Interface for extracting and saving discriminator and generator from the pretrained checkpoint and run SQuAD fine-tuning on discriminator. +- `build_pretraining_dataset.py` - Creates `tfrecord` files from shared text files in the final step of dataset creation. +- `postprocess_pretrained_ckpt.py` - Converts pretrained checkpoint to discriminator checkpoint and generator checkpoint which can be fed into `run_tf_squad.py`. +- `modeling.py` - Implements the ELECTRA pre-training and fine-tuning model architectures with TensorFlow2. +- `optimization.py` - Implements the Adam optimizer, LAMB and the learning rate schedule with TensorFlow2. +- `configuration.py` - Implements parent class for model config. +- `tokenization.py` - Implements the ELECTRA tokenizer. +- `run_pretraining.py` - Implements ELECTRA pre-training. +- `pretrain_utils.py` - Utilities required for pre-training such as dynamic masking etc., +- `run_tf_squad.py` - Implements fine-tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset. +- `inference.py` - Implements interactive question answering. +- `postprocess_pretrained_ckpt.py` - Implements extracting and saving the discriminator and the generator from the pretrained checkpoint. + + +### Parameters + +#### Pre-training parameters + +ELECTRA is designed to pre-train deep bidirectional networks for language representations. The following scripts replicate pre-training on Wikipedia + BookCorpus from this [paper](https://openreview.net/forum?id=r1xMH1BtvB). These scripts are general and can be used for pre-training language representations on any corpus of choice. + +In the parameters expected by `scripts/run_pretraining.sh`, `p1` stands for phase 1 whereas `p2` stands for phase 2 training. They are as follows: + +- `` is per-GPU batch size used for training. Larger batch sizes run more efficiently, but require more GPU memory. Default is 176. 
+- `` is the base learning rate for training. Default is 6e-3. +- `` is the type of math in your model, can be either `fp32` or `amp`. Default is `amp`. The options mean: + - FP32: 32-bit IEEE single precision float format. + - AMP: Automatic mixed precision 16 and 32-bit float format. +- `` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node. Default is 8. +- `` is the percentage of training steps used for warm-up at the start of training. Default is 2000. +- `` is the total number of training steps. Default is 10000. +- `` controls how often checkpoints are saved. Default is 500. +- `` if set to `true`, training should resume from the latest model in `/results/checkpoints`. Default is `false`. +- `` a flag indicating whether a larger batch should be simulated with gradient accumulation. Default is `true`. +- `` an integer indicating the number of steps to accumulate gradients over. Effective batch size / GPU = `training_batch_size` x `gradient_accumulation_steps`. Default is 48. +- `` random seed for the run. + +- `` is per-GPU batch size used for training in phase 2. Larger batch sizes run more efficiently, but require more memory. Default is 24. +- `` is the base learning rate for training phase 2. Default is 4e-3. +- `` is the percentage of training steps used for warm-up at the start of training. Default is 200. +- `` is the total number of training steps for phase 2, to be continued in addition to phase 1. Default is 930. +- `` an integer indicating the number of steps to accumulate gradients over in phase 2. Effective batch size / GPU = `training_batch_size_p2` * `gradient_accumulation_steps_p2`. Default is 144. +- `` A checkpoint to start the pre-training routine on (Usually a ELECTRA pretrained checkpoint). Default is `None`. + + +The complete list of the available parameters for the `run_pretraining.py` script are: + +``` + --model_name MODEL_NAME + - Model name, used to define the name of the results folder. + + --pretrain_tfrecords PRETRAIN_TFRECORDS + - Specifies tfrecord files used for pretraining. + + --max_seq_length MAX_SEQ_LENGTH + - The maximum total input sequence length after + WordPiece tokenization. Sequences longer than + this will be truncated, and sequences shorter + than this will be padded. + + --mask_prob MASK_PROB - Percentage of input tokens to mask out / replace. + + --disc_weight DISC_WEIGHT + - Ratio of discriminator loss over generator loss. + + --generator_hidden_size GENERATOR_HIDDEN_SIZE + - Fraction of discriminator hidden size for generator. + + --train_batch_size TRAIN_BATCH_SIZE + - Batch size per GPU for training. + + --learning_rate LEARNING_RATE + - The initial learning rate for the optimizer. + + --num_train_steps NUM_TRAIN_STEPS + - Total number of training steps to perform. + + --num_warmup_steps NUM_WARMUP_STEPS + - Number of steps of training to perform linear learning + rate warmup for. For example, 0.1 = 10% of training. + + --seed SEED - Sets the seed to use for random number generation. + + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + - Number of update steps to accumulate before + performing a backward/update pass. + + --fp16_compression - Whether to use 16-bit all reduce + + --amp - If set, will perform computations using + automatic mixed precision. + + --log_freq LOG_FREQ - If set, the script will output the training + loss every LOG_FREQ steps. + + --save_checkpoints_steps SAVE_CHECKPOINTS_STEPS + - Checkpoints saving frequency. 
+ + --keep_checkpoint_max KEEP_CHECKPOINT_MAX + - Maximum number of checkpoints to keep. + + --restore_checkpoint RESTORE_CHECKPOINT + - Whether to restore from a checkpoint; if specified, + set to `path-to-checkpoint` or `latest` + + --phase2 - Specified if training on phase 2 only. + If not specified, default pre-training is on phase 1. + + --optimizer OPTIMIZER - Specifies optimizer, `adam` or `lamb`. + + --skip_adaptive - Whether to apply adaptive learning rate on LayerNorm and biases. + + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + - Number of steps to accumulate gradients across before + performing an update. + + --lr_decay_power LR_DECAY_POWER + - Learning rate polynomial decay power. + + --opt_beta_1 OPT_BETA_1 - beta2 of optimizer. + + --opt_beta_2 OPT_BETA_2 - beta2 of optimizer. + + --end_lr END_LR - Ending learning rate. + +``` + +#### Fine-tuning parameters + +Default arguments are listed below in the order `scripts/run_squad.sh` expects: + +- ELECTRA MODEL - The default is `"google/electra-base-discriminator"`. +- Number of training Epochs - The default is `2`. +- Batch size - The default is `16`. +- Learning rate - The default is `4e-4`. +- Precision (either `amp`, `tf32` or `fp32`) - The default is `amp`. +- Number of GPUs - The default is `8`. +- Seed - The default is `1`. +- SQuAD version - The default is `1.1` +- SQuAD directory - The default is `/workspace/electra/data/download/squad/v$SQUAD_VERSION`. +- Output directory for result - The default is `results/`. +- Initialize checkpoint - The default is `"None"` +- Mode (`train`, `eval`, `train_eval`, `prediction`) - The default is `train_eval`. + +The script saves the checkpoint at the end of each epoch to the `checkpoints/` folder. + +The main script `run_tf_squad.py` specific parameters are: + +``` + --electra_model ELECTRA_MODEL - Specifies the type of ELECTRA model to use; + should be the discriminator of a pretrained checkpoint(output of postprocess_pretrained_ckpt.py) + or one of the following: + google/electra-small-generator + google/electra-base-generator + google/electra-large-generator + google/electra-small-discriminator + google/electra-base-discriminator + google/electra-large-discriminator + + --amp - If set, will perform computations using + automatic mixed precision. + + --data_dir DATA_DIR - Path to the SQuAD json for training and evaluation. + + --max_seq_length MAX_SEQ_LENGTH + - The maximum total input sequence length + after WordPiece tokenization. + Sequences longer than this will be truncated, + and sequences shorter than this will be padded. + + --doc_stride DOC_STRIDE - When splitting up a long document into chunks + this parameters sets how much stride to take + between chunks of tokens. + + --max_query_length MAX_QUERY_LENGTH + - The maximum number of tokens for the question. + Questions longer than + will be truncated to the value specified. + + --n_best_size N_BEST_SIZE - The total number of n-best predictions to + generate in the nbest_predictions.json + output file. + + --max_answer_length MAX_ANSWER_LENGTH + - The maximum length of an answer that can be + generated. This is needed because the start and + end predictions are not conditioned on one another. + + --joint_head - If true, beam search will be used to jointly predict + the start and end positions. Default is True. + + --beam_size BEAM_SIZE - The beam size used to do joint predictions. The default value is 5. + + --verbose_logging - If true, all the warnings related to data + processing will be printed. 
A number of warnings + are expected for a normal SQuAD evaluation. + + --do_lower_case - Whether to lower case the input text. Set to + true for uncased models and false for cased models. + + --version_2_with_negative - If true, the SQuAD examples contain questions + that do not have an answer. + + --null_score_diff_threshold NULL_SCORE_DIFF_THRES HOLD + - A null answer will be predicted if null_score + is greater than NULL_SCORE_DIFF_THRESHOLD. +``` + +### Command-line options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example: + +`python run_pretraining.py --help` + +`python run_tf_squad.py --help` + +Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section. + +### Getting the data + +For pre-training ELECTRA, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. ELECTRA requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences. + +The preparation of the pre-training dataset is described in the `dataPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows: + +1. Data download and extract - the dataset is downloaded and extracted. + +2. Clean and format - document tags, etc. are removed from the dataset. + +3. Sentence segmentation - the corpus text file is processed into separate sentences. + +4. Sharding - the sentence segmented corpus file is split into a number of uniformly distributed smaller text documents. + +5. `tfrecord` file creation - each text file shard is processed by the `build_pretraining_dataset.py` script to produce a corresponding `tfrecord` file. The script generates input data for the input text shard. + +The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and `tfrecord` file creation given an arbitrary text file containing a document-separated text corpus. + +For fine-tuning a pre-trained ELECTRA model for specific tasks, by default this repository prepares the following dataset: + +- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering + +Depending on the speed of your internet connection, this process takes about a day to complete. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time. + + +#### Multi-dataset + +This repository provides functionality to combine multiple datasets into a single dataset for pre-training on a diverse text corpus at the shard level. Currently Wikipedia and BookCorpus get merged in `data/create_datasets_from_start.sh`. Snippets to download and format more text corpuses can be added to `data/dataPrep.py`. The sharding scheme combines multiple corpuses together and splits them into the required number of training(90%) and testing(10%) shards. Once the data is sharded, the `build_pretraining_dataset.py` converts raw text shards to tokenized segments and saves the dataset to the `data` directory in TFRecord format. This dataset can now be used to pre-train ELECTRA. 
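Before launching pre-training on a newly assembled corpus, it is worth sanity-checking the generated shards. The snippet below is a minimal sketch and is not part of the repository's scripts; the shard pattern is an assumption and should be adjusted to wherever `create_datasets_from_start.sh` wrote its output:

```python
import tensorflow as tf

# Assumed shard location and suffix; adjust to your actual output directory.
shard_pattern = "data/tfrecords/train/*.tfrecord*"
shards = tf.io.gfile.glob(shard_pattern)
print(f"Found {len(shards)} training shards")

# Counting every record is slow for a full corpus; a couple of shards
# are enough for a quick sanity check.
sample = shards[:2]
num_records = sum(1 for _ in tf.data.TFRecordDataset(sample))
print(f"{num_records} serialized examples in the first {len(sample)} shard(s)")
```

The shard counts should roughly reflect the 90%/10% training/testing split described above.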
+
+
+### Training process
+
+The training process consists of two steps: pre-training and fine-tuning.
+
+#### Pre-training
+
+Pre-training is performed using `run_pretraining.py` along with parameters defined in `scripts/run_pretraining.sh` and `scripts/configs/pretrain_config.sh`.
+
+The `run_pretraining.sh` script runs a job on a single node that trains the ELECTRA-base model from scratch using the Wikipedia and BookCorpus datasets as training data with the LAMB optimizer.
+
+Phase 1: (Maximum sequence length of 128)
+- Runs on 8 GPUs with a training batch size of 176 per GPU
+- Uses a learning rate of 6e-3
+- Has FP16 precision enabled
+- Runs for 10000 steps, where the first 2000 are warm-up steps
+- Saves a checkpoint every 500 iterations (keeps only the latest 5 checkpoints) and at the end of training. All checkpoints and training logs are saved to the `results/` directory.
+- Creates a log file containing all the output
+
+Phase 2: (Maximum sequence length of 512)
+- Runs on 8 GPUs with a training batch size of 24 per GPU
+- Uses a learning rate of 4e-3
+- Has FP16 precision enabled
+- Runs for 930 steps, where the first 200 are warm-up steps
+- Saves a checkpoint every 500 iterations (keeps only the latest 5 checkpoints) and at the end of training. All checkpoints and training logs are saved to the `results/` directory.
+- Creates a log file containing all the output
+
+Specific configs available in `scripts/configs/pretrain_config.sh` can be run as follows:
+```
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp)
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgx2_16gpu_amp)
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgx1_8gpu_amp)
+```
+
+The above commands will train ELECTRA-base on Wikipedia and BookCorpus to state-of-the-art accuracy on any DGX platform using FP16 arithmetic. Around 96% of the training sequences are of length 128 (phase 1 of training) and less than 4% of the training sequences are of length 512 (phase 2 of training).
+
+To run the pre-training routine from an initial checkpoint, do the following in `scripts/run_pretraining.sh` (a sketch is shown after the multi-node example below):
+- set `restore_checkpoint=` to the path of the checkpoint to resume from, or to `latest`
+- Note: The parameter value assigned to `--model_size` during training should remain unchanged. Also, to resume pre-training on your corpus of choice, the training dataset should be created using the same vocabulary file used in `data/create_datasets_from_start.sh`.
+
+
+#### Multi-node
+
+Multi-node runs can be launched on a Pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script; the following commands show a 48-node NVIDIA DGX A100 example for phase 1 and phase 2:
+
+```
+BATCHSIZE=176 LR=6e-3 GRAD_ACCUM_STEPS=1 PHASE=1 STEPS=10000 WARMUP=2000 b1=0.878 b2=0.974 decay=0.5 skip_adaptive=yes end_lr=0.0 sbatch -N48 --ntasks-per-node=8 run.sub
+BATCHSIZE=24 LR=4e-3 GRAD_ACCUM_STEPS=3 PHASE=2 STEPS=930 WARMUP=200 b1=0.878 b2=0.974 decay=0.5 skip_adaptive=yes end_lr=0.0 sbatch -N48 --ntasks-per-node=8 run.sub
+```
+
+The checkpoint after phase 1 will be saved in `/models/`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
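+For example (a sketch only; the checkpoint path below is a hypothetical placeholder), resuming single-node pre-training from a saved checkpoint as described above amounts to pointing `restore_checkpoint` in `scripts/run_pretraining.sh` at the checkpoint and relaunching with the same config:
+
+```
+# Inside scripts/run_pretraining.sh (sketch; hypothetical path -- or set it to "latest"):
+restore_checkpoint=results/base/checkpoints
+
+# Relaunch with the same --model_size and your usual config, e.g.:
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp)
+```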
+
+For the multi-node commands above, the batch variables `BATCHSIZE`, `LR`, `GRAD_ACCUM_STEPS`, `PHASE`, `STEPS`, `WARMUP`, `b1`, `b2`, `decay`, `skip_adaptive` and `end_lr` refer to the Python arguments `train_batch_size`, `learning_rate`, `gradient_accumulation_steps`, `phase2`, `num_train_steps`, `num_warmup_steps`, `opt_beta_1`, `opt_beta_2`, `lr_decay_power`, `skip_adaptive` and `end_lr` in `run_pretraining.py`, respectively.
+
+Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `docker_image` and `datadir` handle the location of the files for each phase.
+
+Refer to the file's contents to see the full list of variables to adjust for your system.
+
+
+#### Fine-tuning
+
+Fine-tuning is provided for a variety of tasks. The following tasks are included with this repository through the following scripts:
+
+- Question Answering (`scripts/run_squad.sh`)
+
+By default, each Python script implements fine-tuning a pre-trained ELECTRA model for a specified number of training epochs as well as evaluation of the fine-tuned model. Each shell script invokes the associated Python script with the following default parameters:
+
+- Uses 8 GPUs
+- Has FP16 precision enabled
+- Has XLA enabled
+- Saves a checkpoint at the end of training to the `checkpoints/` folder
+
+Specific configs available in `scripts/configs/squad_config.sh` can be run as follows:
+```
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) train_eval
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgx2_16gpu_amp) train_eval
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgx1_8gpu_amp) train_eval
+```
+
+The fine-tuning Python scripts implement support for mixed precision and multi-GPU training through [Horovod](https://github.com/horovod/horovod). For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
+
+All fine-tuning shell scripts have the same positional arguments, outlined below:
+
+```
+bash scripts/run_squad.sh
+```
+
+By default, the mode positional argument is set to `train_eval`. See the [Fine-tuning parameters](#fine-tuning-parameters) section for explanations of each positional argument.
+
+Note: The first positional argument (the path to the checkpoint to load) is required.
+
+Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory, or that a separate path is supplied as a command-line input to `run_squad.sh`.
+
+### Inference process
+
+#### Fine-tuning inference
+
+Inference for the fine-tuning tasks is enabled by the same scripts as training:
+
+- Question Answering (`scripts/run_squad.sh`)
+
+The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned ELECTRA model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
+
+Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. Setting the `mode` positional argument to `eval` runs predictions on a given dataset and evaluates them, while `prediction` only generates the predictions.
+
+`bash scripts/run_squad.sh `
+
+To run inference interactively on question-context pairs, use the script `run_inference.py` as follows:
+
+`python run_inference.py --electra_model --init_checkpoint --question="What food does Harry like?"
--context="My name is Harry and I grew up in Canada. I love apples."`
+
+
+## Performance
+
+The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+#### Training performance benchmark
+
+Training performance benchmarks for both pre-training phases can be obtained by running `scripts/benchmark_pretraining.sh`. Default parameters are set to run a few training steps of the convergence configuration for an NVIDIA DGX A100 system.
+
+To benchmark training performance with other parameters, run:
+```
+bash scripts/benchmark_pretraining.sh
+```
+
+An example call used to generate throughput numbers:
+```
+bash scripts/benchmark_pretraining.sh 88 amp xla 8 true 2 12 4 base
+```
+
+Training performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark_squad.sh`. The required parameters can be passed through the command line as described in [Training process](#training-process). The performance information is printed after 200 training iterations.
+
+To benchmark the training performance on a specific batch size, run:
+```
+bash scripts/benchmark_squad.sh train
+```
+
+An example call used to generate throughput numbers:
+```
+bash scripts/benchmark_squad.sh train 8 16
+```
+
+#### Inference performance benchmark
+
+Inference performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark_squad.sh`. The required parameters can be passed through the command line as described in [Inference process](#inference-process). This script runs one epoch by default on the SQuAD v1.1 dataset and extracts the average performance for the given configuration.
+
+To benchmark the inference performance on a specific batch size, run:
+`bash scripts/benchmark_squad.sh eval `
+
+An example call used to generate throughput numbers:
+`bash scripts/benchmark_squad.sh eval 8 256`
+
+
+### Results
+
+The following sections provide details on how we achieved our performance and accuracy in training and inference. All results are for the ELECTRA-base model on the SQuAD v1.1 dataset with a sequence length of 384, unless otherwise mentioned.
+
+#### Training accuracy results
+
+##### Pre-training loss curves
+![Pretraining Loss Curves](images/total_loss.svg)
+
+Phase 1 is shown by the blue curve and phase 2 by the grey curve. The y-axis shows the total loss and the x-axis shows the total number of training steps.
+ +##### Pre-training loss results + +| DGX System | GPUs | Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - TF32/FP32 | Final Loss - mixed precision | Time to train(hours) - TF32/FP32 | Time to train(hours) - mixed precision | Time to train speedup (TF32/FP32 to mixed precision) +|---|---|---|---|---|---|---|---|--- +|48 x DGX A100 |8 |176 and 24 |1 and 3 |8.686|8.68|1.61 |1.126|1.43 +|24 x DGX-2H |16|176 and 24 |1 and 3 |8.72 |8.67|5.58 |1.74 |3.20 +|1 x DGX A100 |8 |176 and 24 |48 and 144|- |- |54.84 |30.47|1.8 +|1 x DGX-1 16G |8 |88 and 12 |96 and 288|- |- |241.8 |65.1 |3.71 +|1 x DGX-2 32G |16|176 and 24 |24 and 72 |- |- |109.97|29.08|3.78 + +In the above table, FP32 and TF32 runs were made at half the batch per GPU and twice the gradient accumulation steps of a run with mixed precision in order to not run out of memory. + + +The SQuAD fine-tuning scripts by default train on [Google's ELECTRA++ base pretrained checkpoint](https://github.com/google-research/electra#released-models) which uses around 10x training dataset (dataset used by XLNet authors) and greater than 5x training steps compared to the training recipe in `scripts/run_pretraining.sh`. The latter trains and achieves state-of-the-art accuracy on Wikipedia and BookCorpus datasets only. + +##### Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. + +*ELECTRA BASE++* + +| GPUs | Batch size / GPU | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - TF32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 32 | 87.19 / 92.85 | 87.19 / 92.84 | 1699 | 749 | 2.27 | +| 8 | 32 | 86.84 / 92.57 | 86.83 / 92.56 | 263 | 201 | 1.30 | + + +##### Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. 
+ +*ELECTRA BASE++* + +| GPUs | Batch size / GPU (FP32 : mixed precision) | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 8 : 16 | 87.36 / 92.82 | 87.32 / 92.74 | 5136 | 1378 | 3.73 | +| 8 | 8 : 16 | 87.02 / 92.73 | 87.02 / 92.72 | 730 | 334 | 2.18 | + +*ELECTRA BASE checkpoint Wikipedia and BookCorpus* + +GPUs | SQuAD version| Batch size / GPU (FP32 : mixed precision) | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|-----|----------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 8 | v1.1 | 8 : 16 | 85.00 / 90.94 | 85.04 / 90.96 | 5136 | 1378 | 3.73 | +| 8 | v2.0 | 8 : 16 | 80.517 / 83.36 | 80.523 / 83.43 | 730 | 334 | 2.18 + +##### Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-2 (16x V100 32G) GPUs. + +*ELECTRA BASE++* + +| GPUs | Batch size / GPU | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 32 | 87.14 / 92.69 | 86.95 / 92.69 | 4478 | 1162 | 3.85 | +| 16 | 32 | 86.95 / 90.58 | 86.93 / 92.48 | 333 | 229 | 1.45 | + + +##### Training stability test + +###### Pre-training stability test: NVIDIA DGX A100 (8x A100 40GB) + +*ELECTRA BASE Wikipedia and BookCorpus* + +Training stability with 48 x DGX A100, TF32 computations and loss reported after Phase 2: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Final Loss| 8.72 | 8.69 | 8.71 | 8.7 | 8.68 | 8.7 | 0.015 + +###### Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB) + +*ELECTRA BASE++* + +Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v1.1: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Exact Match %| 86.99 | 86.81 | 86.95 | 87.10 | 87.26 | 87.02 | 0.17 +| f1 % | 92.7 | 92.66 | 92.65 | 92.61 | 92.97 | 92.72 | 0.14 + + Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v2.0: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Exact Match %| 83.00 | 82.84 | 83.11 | 82.70 | 82.94 | 82.91 | 0.15 +| f1 % | 85.63 | 85.48 | 85.69 | 85.31 | 85.57 | 85.54 | 0.15 + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (8x A100 40GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. 
Performance numbers (in items/images per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX A100 (8x A100 40GB) + +| GPUs | Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 88 and 176| 768 and 384 | 128| 533 |955 |1.79|1.00| 1.00 +|8 | 88 and 176| 96 and 48 | 128| 4202|7512|1.79|7.88| 7.87 +|1 | 12 and 24 | 2304 and 1152| 512| 90 |171 |1.90|1.00| 1.00 +|8 | 12 and 24 | 288 and 144 | 512| 716 |1347|1.88|7.96| 7.88 + +###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB) + +| GPUs | Batch size / GPU | Sequence length | Throughput - TF32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision | +|------------------|-----------|-----------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +| 1 | 32 | 384 | 107 | 317 | 2.96 | 1.00 | 1.00 +| 8 | 32 | 384 | 828 | 2221| 2.68 | 7.74 | 7.00 + +##### Training performance: NVIDIA DGX-1 (8x V100 16GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training scripts in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX-1 (8x V100 16GB) + +| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 40 and 88| 1689 and 768 | 128| 116 |444 |3.83 |1.00 | 1.00 +|8 | 40 and 88| 211 and 96 | 128| 920 |3475|3.77 |7.93 | 7.83 +|1 | 6 and 12 | 4608 and 2304| 512| 24 |84 |3.50 |1.00 | 1.00 +|8 | 6 and 12 | 576 and 288 | 512| 190 |656 |3.45 |7.92 | 7.81 + +###### Fine-tuning NVIDIA DGX-1 (8x V100 16GB) + +| GPUs | Batch size / GPU (FP32 : mixed precision) | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision | +|------------------|-----------|-----------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 8 : 16| 384| 35| 154| 4.4 | 1.00| 1.00 +|8 | 8 : 16| 384|268|1051| 3.92| 7.66| 6.82 + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). 
+ +##### Training performance: NVIDIA DGX-2 (16x V100 32GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training scripts in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX-2 (16x V100 32GB) + +| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 88 and 176| 768 and 384 | 128| 128 |500 |3.91| 1.00 | 1.00 +|8 | 88 and 176| 96 and 48 | 128| 1011|3916|3.87| 7.90 | 7.83 +|16| 88 and 176| 48 and 24 | 128| 2018|7773|3.85|15.77 |15.55 +|1 | 12 and 24 | 2304 and 1152| 512| 27 |96 |3.55| 1.00 | 1.00 +|8 | 12 and 24 | 288 and 144 | 512| 213 |754 |3.54| 7.89 | 7.85 +|16| 12 and 24 | 144 and 72 | 512| 426 |1506|3.54| 15.78|15.69 + +###### Fine-tuning NVIDIA DGX-2 (16x V100 32GB) + +| GPUs | Batch size / GPU | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision | +|------|-----------|-------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------| +| 1 | 16 | 384 | 40 | 184 | 4.6 | 1.00 | 1.00 | +| 8 | 16 | 384 | 311 | 1289 | 4.14 | 7.77 | 7.00 | +| 16 | 16 | 384 | 626 | 2594 | 4.14 | 15.65 | 14.09 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +#### Inference performance results + +##### Inference performance: NVIDIA DGX A100 (1x A100 40GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` inferencing benchmarking script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU. 
+ +###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB) + +FP16 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 166 | 6.035 | 5.995 | 6.013 | 6.029 | +| 256 | 384 | 886 | 276.26 | 274.53 | 275.276 | 275.946 | +| 512 | 384 | 886 | 526.5 | 525.014 | 525.788 | 525.788 | + +TF32 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 122 | 8.228 | 8.171 | 8.198 | 8.221 | +| 256 | 384 | 342 | 729.293 | 727.990 | 728.505 | 729.027 | +| 512 | 384 | 350 | 1429.314 | 1427.719 | 1428.550 | 1428.550 | + + + +##### Inference performance: NVIDIA T4 + +Our results were obtained by running the `scripts/benchmark_squad.sh` script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA Tesla T4 (1x T4 16GB) GPU. + +###### Fine-tuning inference on NVIDIA T4 + +FP16 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 58 | 17.413 | 17.295 | 17.349 | 17.395 | +| 128 | 384 | 185 | 677.298 | 675.211 | 675.674 | 676.269 | +| 256 | 384 | 169 | 1451.396 | 1445.070 | 1447.654 | 1450.141 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +## Release notes + +### Changelog + +July 2020 +- Initial release. + +October 2020 +- Data preparation scripts for pre-training. +- Pre-training support. +- Mixed precision support with Keras AMP policy. +- Update beam size in SQuAD fine-tuning from 4 to 5 for higher accuracy. +- T4 inference performance. + +### Known issues + +There are no known issues with this model. diff --git a/modelzoo/ELECTRA/build_pretraining_dataset.py b/modelzoo/ELECTRA/build_pretraining_dataset.py new file mode 100644 index 00000000..e1385cb2 --- /dev/null +++ b/modelzoo/ELECTRA/build_pretraining_dataset.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Writes out text data as tfrecords that ELECTRA can be pre-trained on.""" + +import argparse +import multiprocessing +import os +import random +import time +import tensorflow as tf + +import utils +from tokenization import ElectraTokenizer + + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +class ExampleBuilder(object): + """Given a stream of input text, creates pretraining examples.""" + + def __init__(self, tokenizer, max_length): + self._tokenizer = tokenizer + self._current_sentences = [] + self._current_length = 0 + self._max_length = max_length + self._target_length = max_length + + def add_line(self, line): + """Adds a line of text to the current example being built.""" + line = line.strip().replace("\n", " ") + if (not line) and self._current_length != 0: # empty lines separate docs + return self._create_example() + bert_tokens = self._tokenizer.tokenize(line) + bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens) + self._current_sentences.append(bert_tokids) + self._current_length += len(bert_tokids) + if self._current_length >= self._target_length: + return self._create_example() + return None + + def _create_example(self): + """Creates a pre-training example from the current list of sentences.""" + # small chance to only have one segment as in classification tasks + if random.random() < 0.1: + first_segment_target_length = 100000 + else: + # -3 due to not yet having [CLS]/[SEP] tokens in the input text + first_segment_target_length = (self._target_length - 3) // 2 + + first_segment = [] + second_segment = [] + for sentence in self._current_sentences: + # the sentence goes to the first segment if (1) the first segment is + # empty, (2) the sentence doesn't put the first segment over length or + # (3) 50% of the time when it does put the first segment over length + if (len(first_segment) == 0 or + len(first_segment) + len(sentence) < first_segment_target_length or + (len(second_segment) == 0 and + len(first_segment) < first_segment_target_length and + random.random() < 0.5)): + first_segment += sentence + else: + second_segment += sentence + + # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens + first_segment = first_segment[:self._max_length - 2] + second_segment = second_segment[:max(0, self._max_length - + len(first_segment) - 3)] + + # prepare to start building the next example + self._current_sentences = [] + self._current_length = 0 + # small chance for random-length instead of max_length-length example + if random.random() < 0.05: + self._target_length = random.randint(5, self._max_length) + else: + self._target_length = self._max_length + + return self._make_tf_example(first_segment, second_segment) + + def _make_tf_example(self, first_segment, second_segment): + """Converts two "segments" of text into a tf.train.Example.""" + vocab = self._tokenizer.vocab + input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]] + segment_ids = [0] * len(input_ids) + if second_segment: + input_ids += second_segment + [vocab["[SEP]"]] + segment_ids += [1] * (len(second_segment) + 1) + input_mask = [1] * len(input_ids) + input_ids += [0] * (self._max_length - len(input_ids)) + input_mask += [0] * (self._max_length - len(input_mask)) + segment_ids += [0] * (self._max_length - len(segment_ids)) + tf_example = tf.train.Example(features=tf.train.Features(feature={ + "input_ids": create_int_feature(input_ids), + "input_mask": create_int_feature(input_mask), + 
"segment_ids": create_int_feature(segment_ids) + })) + return tf_example + + +class ExampleWriter(object): + """Writes pre-training examples to disk.""" + + def __init__(self, job_id, vocab_file, output_dir, max_seq_length, + num_jobs, blanks_separate_docs, do_lower_case, + num_out_files=1000): + self._blanks_separate_docs = blanks_separate_docs + tokenizer = ElectraTokenizer( + vocab_file=vocab_file, + do_lower_case=do_lower_case) + self._example_builder = ExampleBuilder(tokenizer, max_seq_length) + self._writers = [] + for i in range(num_out_files): + if i % num_jobs == job_id: + output_fname = os.path.join( + output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format( + i, num_out_files)) + self._writers.append(tf.io.TFRecordWriter(output_fname)) + self.n_written = 0 + + def write_examples(self, input_file): + """Writes out examples from the provided input file.""" + with tf.io.gfile.GFile(input_file) as f: + for line in f: + line = line.strip() + if line or self._blanks_separate_docs: + example = self._example_builder.add_line(line) + if example: + self._writers[self.n_written % len(self._writers)].write( + example.SerializeToString()) + self.n_written += 1 + example = self._example_builder.add_line("") + if example: + self._writers[self.n_written % len(self._writers)].write( + example.SerializeToString()) + self.n_written += 1 + + def finish(self): + for writer in self._writers: + writer.close() + + +def write_examples(job_id, args): + """A single process creating and writing out pre-processed examples.""" + + def log(*args): + msg = " ".join(map(str, args)) + print("Job {}:".format(job_id), msg) + + log("Creating example writer") + example_writer = ExampleWriter( + job_id=job_id, + vocab_file=args.vocab_file, + output_dir=args.output_dir, + max_seq_length=args.max_seq_length, + num_jobs=args.num_processes, + blanks_separate_docs=args.blanks_separate_docs, + do_lower_case=args.do_lower_case, + num_out_files=args.num_out_files, + ) + log("Writing tf examples") + fnames = sorted(tf.io.gfile.listdir(args.corpus_dir)) + fnames = [f for (i, f) in enumerate(fnames) + if i % args.num_processes == job_id] + random.shuffle(fnames) + start_time = time.time() + for file_no, fname in enumerate(fnames): + if file_no > 0: + elapsed = time.time() - start_time + log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, " + "{:} examples written".format( + file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed), + int((len(fnames) - file_no) / (file_no / elapsed)), + example_writer.n_written)) + example_writer.write_examples(os.path.join(args.corpus_dir, fname)) + example_writer.finish() + log("Done!") + +# python build_pretraining_dataset --corpus-dir +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--corpus-dir", required=True, + help="Location of pre-training text files.") + parser.add_argument("--vocab-file", required=True, + help="Location of vocabulary file.") + parser.add_argument("--output-dir", required=True, + help="Where to write out the tfrecords.") + parser.add_argument("--max-seq-length", default=128, type=int, + help="Number of tokens per example.") + parser.add_argument("--num-processes", default=1, type=int, + help="Parallelize across multiple processes.") + parser.add_argument("--blanks-separate-docs", default=True, type=bool, + help="Whether blank lines indicate document boundaries.") + parser.add_argument("--do-lower-case", dest='do_lower_case', + action='store_true', help="Lower case input text.") + 
parser.add_argument("--no-lower-case", dest='do_lower_case', + action='store_false', help="Don't lower case input text.") + parser.add_argument("--num-out-files", default=1000, type=int, + help="Number of output files.") + parser.add_argument("--seed", default=1314, type=int) + args = parser.parse_args() + + random.seed(args.seed) + + utils.rmkdir(args.output_dir) + if args.num_processes == 1: + write_examples(0, args) + else: + jobs = [] + for i in range(args.num_processes): + job = multiprocessing.Process(target=write_examples, args=(i, args)) + jobs.append(job) + job.start() + for job in jobs: + job.join() + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/configuration.py b/modelzoo/ELECTRA/configuration.py new file mode 100644 index 00000000..df8d5ae7 --- /dev/null +++ b/modelzoo/ELECTRA/configuration.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ELECTRA model configuration """ + + +import logging + +from configuration_utils import PretrainedConfig + + +logger = logging.getLogger(__name__) + +ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json", +} + + +class ElectraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`. + It is used to instantiate an ELECTRA model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ELECTRA `google/electra-small-discriminator `__ + architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the ELECTRA model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`. 
+ embedding_size (:obj:`int`, optional, defaults to 128): + Dimensionality of the encoder layers and the pooler layer. + hidden_size (:obj:`int`, optional, defaults to 256): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, optional, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 1024): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example:: + + from transformers import ElectraModel, ElectraConfig + + # Initializing a ELECTRA electra-base-uncased style configuration + configuration = ElectraConfig() + + # Initializing a model from the electra-base-uncased style configuration + model = ElectraModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. 
+ """ + pretrained_config_archive_map = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "electra" + + def __init__( + self, + vocab_size=30522, + embedding_size=128, + hidden_size=256, + num_hidden_layers=12, + num_attention_heads=4, + intermediate_size=1024, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps diff --git a/modelzoo/ELECTRA/configuration_utils.py b/modelzoo/ELECTRA/configuration_utils.py new file mode 100644 index 00000000..b90c4025 --- /dev/null +++ b/modelzoo/ELECTRA/configuration_utils.py @@ -0,0 +1,518 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + + +import copy +import json +import logging +import os +from typing import Dict, Optional, Tuple + +from utils import log +from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url + + +logger = logging.getLogger(__name__) + + +class PretrainedConfig(object): + r""" Base class for all configuration classes. + Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + + Class attributes (overridden by derived classes): + - ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values. + - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`. + + Args: + finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): + Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. 
+ num_labels (:obj:`int`, `optional`, defaults to `2`): + Number of classes to use when the model is a classification model (sequences/tokens) + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Should the model returns attentions weights. + output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): + Should the model returns all hidden-states. + torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): + Is the model used with Torchscript (for PyTorch models). + """ + pretrained_config_archive_map = {} # type: Dict[str, str] + model_type = "" # type: str + + def __init__(self, **kwargs): + # Attributes with defaults + self.output_attentions = kwargs.pop("output_attentions", False) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_past = kwargs.pop("output_past", True) # Not used by all models + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) + + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder + self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) + self.is_decoder = kwargs.pop("is_decoder", False) + + # Parameters for sequence generation + self.max_length = kwargs.pop("max_length", 20) + self.min_length = kwargs.pop("min_length", 0) + self.do_sample = kwargs.pop("do_sample", False) + self.early_stopping = kwargs.pop("early_stopping", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) + self.bad_words_ids = kwargs.pop("bad_words_ids", None) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) + + # Fine-tuning task arguments + self.architectures = kwargs.pop("architectures", None) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.num_labels = kwargs.pop("num_labels", 2) + self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) + self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) + + # Tokenizer arguments TODO: eventually tokenizer and models should share the same config + self.prefix = kwargs.pop("prefix", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + + # task specific arguments + self.task_specific_params = kwargs.pop("task_specific_params", None) + + # TPU arguments + self.xla_device = kwargs.pop("xla_device", None) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + log("Can't set {} with value {} for {}".format(key, value, self)) + raise err + + @property + def num_labels(self): + return self._num_labels + + @num_labels.setter + def num_labels(self, num_labels): + self._num_labels = num_labels + self.id2label = {i: 
"LABEL_{}".format(i) for i in range(self.num_labels)} + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) + + def save_pretrained(self, save_directory): + """ + Save a configuration object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. + + Args: + save_directory (:obj:`string`): + Directory where the configuration JSON file will be saved. + """ + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file) + log("Configuration saved in {}".format(output_config_file)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": + r""" + + Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. + + Args: + pretrained_model_name_or_path (:obj:`string`): + either: + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or + download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to + our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the + :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`string`, `optional`): + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + kwargs (:obj:`Dict[str, any]`, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is + controlled by the `return_unused_kwargs` keyword parameter. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + proxies (:obj:`Dict`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g.: + :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` + The proxies are used on each request. + return_unused_kwargs: (`optional`) bool: + If False, then this function returns just the final configuration object. + If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part + of kwargs which has not been used to update `config` and is otherwise ignored. 
+ + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs + ) -> Tuple[Dict, Dict]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used + for instantiating a Config using `from_dict`. + + Parameters: + pretrained_model_name_or_path (:obj:`string`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + pretrained_config_archive_map: (:obj:`Dict[str, str]`, `optional`) Dict: + A map of `shortcut names` to `url`. By default, will use the current class attribute. + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. + + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + if pretrained_config_archive_map is None: + pretrained_config_archive_map = cls.pretrained_config_archive_map + + if pretrained_model_name_or_path in pretrained_config_archive_map: + config_file = pretrained_config_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + else: + config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + # Load config dict + if resolved_config_file is None: + raise EnvironmentError + config_dict = cls._dict_from_json_file(resolved_config_file) + + except EnvironmentError: + if pretrained_model_name_or_path in pretrained_config_archive_map: + msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file + ) + else: + msg = ( + "Can't load '{}'. 
Make sure that:\n\n" + "- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + "- or '{}' is the correct path to a directory containing a '{}' file\n\n".format( + pretrained_model_name_or_path, + pretrained_model_name_or_path, + pretrained_model_name_or_path, + CONFIG_NAME, + ) + ) + raise EnvironmentError(msg) + + except json.JSONDecodeError: + msg = ( + "Couldn't reach server at '{}' to download configuration file or " + "configuration file is not a valid JSON file. " + "Please check network or file content here: {}.".format(config_file, resolved_config_file) + ) + raise EnvironmentError(msg) + + if resolved_config_file == config_file: + log("loading configuration file {}".format(config_file)) + else: + log("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) + + return config_dict, kwargs + + @classmethod + def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": + """ + Constructs a `Config` from a Python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved + from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` + method. + kwargs (:obj:`Dict[str, any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + # log("Model config {}".format(str(config))) + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_json_file(cls, json_file: str) -> "PretrainedConfig": + """ + Constructs a `Config` from the path to a json file of parameters. + + Args: + json_file (:obj:`string`): + Path to the JSON file containing the parameters. + + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + + """ + config_dict = cls._dict_from_json_file(json_file) + return cls(**config_dict) + + @classmethod + def _dict_from_json_file(cls, json_file: str): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return "{} {}".format(self.__class__.__name__, self.to_json_string()) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. + + Returns: + :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + return output + + def to_json_string(self): + """ + Serializes this instance to a JSON string. + + Returns: + :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. + """ + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ + Save this instance to a json file. 
+ + Args: + json_file_path (:obj:`string`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def update(self, config_dict: Dict): + """ + Updates attributes of this class + with attributes from `config_dict`. + + Args: + :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", + "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", +} + + +class 
BertConfig(PretrainedConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+        It is used to instantiate a BERT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+
+        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30522):
+                Vocabulary size of the BERT model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.BertModel`.
+            hidden_size (:obj:`int`, optional, defaults to 768):
+                Dimensionality of the encoder layers and the pooler layer.
+            num_hidden_layers (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            num_attention_heads (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size (:obj:`int`, optional, defaults to 3072):
+                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention probabilities.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            type_vocab_size (:obj:`int`, optional, defaults to 2):
+                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+
+        Example::
+
+            from transformers import BertModel, BertConfig
+
+            # Initializing a BERT bert-base-uncased style configuration
+            configuration = BertConfig()
+
+            # Initializing a model from the bert-base-uncased style configuration
+            model = BertModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
+ """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/BooksDownloader.py b/modelzoo/ELECTRA/data/BooksDownloader.py new file mode 100644 index 00000000..a10ebde0 --- /dev/null +++ b/modelzoo/ELECTRA/data/BooksDownloader.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + +class BooksDownloader: + def __init__(self, save_path): + self.save_path = save_path + pass + + + def download(self): + bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' + bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' + bookscorpus_download_command += ' --trash-bad-count' + bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) diff --git a/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py b/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py new file mode 100644 index 00000000..22e48d4b --- /dev/null +++ b/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py @@ -0,0 +1,32 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
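Before the ELECTRA data-preparation utilities, a minimal usage sketch for the configuration classes defined earlier in this patch. The import path below is an assumption made purely for illustration; use whichever module the patch actually places PretrainedConfig and BertConfig in.

# Sketch only: JSON round-trip through the config helpers defined above.
# The module name `configuration` is hypothetical.
from configuration import BertConfig

config = BertConfig(num_hidden_layers=6)       # override one default, keep the rest
config.to_json_file("/tmp/bert_config.json")   # writes to_json_string(): sorted, indented JSON

restored = BertConfig.from_json_file("/tmp/bert_config.json")
print(restored.num_hidden_layers)              # 6

# from_dict() applies extra kwargs that match existing attributes and can
# hand back the ones it did not use.
cfg, unused = BertConfig.from_dict(
    {"hidden_size": 768}, return_unused_kwargs=True, not_a_bert_field=1
)
print(unused)                                  # {'not_a_bert_field': 1}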
+
+import glob
+import os
+
+class BookscorpusTextFormatting:
+    def __init__(self, books_path, output_filename, recursive = False):
+        self.books_path = books_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one book per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True):
+                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
+                    for line in file:
+                        if line.strip() != '':
+                            ofile.write(line.strip() + ' ')
+                ofile.write("\n\n")
\ No newline at end of file
diff --git a/modelzoo/ELECTRA/data/Downloader.py b/modelzoo/ELECTRA/data/Downloader.py
new file mode 100644
index 00000000..ebbd43d6
--- /dev/null
+++ b/modelzoo/ELECTRA/data/Downloader.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
+from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
+from WikiDownloader import WikiDownloader
+from BooksDownloader import BooksDownloader
+from MRPCDownloader import MRPCDownloader
+from SquadDownloader import SquadDownloader
+
+
+class Downloader:
+    def __init__(self, dataset_name, save_path):
+        self.dataset_name = dataset_name
+        self.save_path = save_path
+
+
+    def download(self):
+        if self.dataset_name == 'bookscorpus':
+            self.download_bookscorpus()
+
+        elif self.dataset_name == 'wikicorpus_en':
+            self.download_wikicorpus('en')
+
+        elif self.dataset_name == 'wikicorpus_zh':
+            self.download_wikicorpus('zh')
+
+        elif self.dataset_name == 'google_pretrained_weights':
+            self.download_google_pretrained_weights()
+
+        elif self.dataset_name == 'nvidia_pretrained_weights':
+            self.download_nvidia_pretrained_weights()
+
+        elif self.dataset_name == 'mrpc':
+            self.download_mrpc()
+
+        elif self.dataset_name == 'squad':
+            self.download_squad()
+
+        elif self.dataset_name == 'all':
+            self.download_bookscorpus()
+            self.download_wikicorpus('en')
+            self.download_wikicorpus('zh')
+            self.download_google_pretrained_weights()
+            self.download_nvidia_pretrained_weights()
+            self.download_mrpc()
+            self.download_squad()
+
+        else:
+            print(self.dataset_name)
+            assert False, 'Unknown dataset_name provided to downloader'
+
+
+    def download_bookscorpus(self):
+        downloader = BooksDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_wikicorpus(self, language):
+        downloader = WikiDownloader(language, self.save_path)
+        downloader.download()
+
+
+    def download_google_pretrained_weights(self):
+        downloader = GooglePretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_nvidia_pretrained_weights(self):
+        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_mrpc(self):
+        downloader =
MRPCDownloader(self.save_path) + downloader.download() + + + def download_squad(self): + downloader = SquadDownloader(self.save_path) + downloader.download() diff --git a/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py b/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py new file mode 100644 index 00000000..bb0684d3 --- /dev/null +++ b/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py @@ -0,0 +1,158 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import os +import urllib.request +import zipfile + +class GooglePretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/google_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Download urls + self.model_urls = { + 'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'), + 'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'), + 'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'), + 'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'), + 'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'), + 'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'), + 'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip') + } + + # SHA256sum verification for file download integrity (and checking for changes from the download source over time) + self.bert_base_uncased_sha = { + 'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc', + 'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84', + 'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b', + 'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_large_uncased_sha = { + 'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb', + 'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1', + 'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093', + 'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + 
self.bert_base_cased_sha = { + 'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc', + 'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea', + 'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1', + 'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_large_cased_sha = { + 'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57', + 'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0', + 'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf', + 'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_base_multilingual_cased_sha = { + 'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0', + 'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5', + 'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37', + 'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa', + 'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c', + } + + self.bert_large_multilingual_uncased_sha = { + 'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624', + 'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429', + 'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7', + 'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29', + 'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f', + } + + self.bert_base_chinese_sha = { + 'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015', + 'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba', + 'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e', + 'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047', + 'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c', + } + + # Relate SHA to urls for loop below + self.model_sha = { + 'bert_base_uncased': self.bert_base_uncased_sha, + 'bert_large_uncased': self.bert_large_uncased_sha, + 'bert_base_cased': self.bert_base_cased_sha, + 'bert_large_cased': self.bert_large_cased_sha, + 'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha, + 'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha, + 'bert_base_chinese': self.bert_base_chinese_sha + } + + # Helper to get sha256sum of a file + def sha256sum(self, filename): + h = hashlib.sha256() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + + return h.hexdigest() + + def download(self): + # Iterate over urls: download, unzip, verify sha256sum + found_mismatch_sha = False + for model in self.model_urls: + url = self.model_urls[model][0] + file = self.save_path + '/' + self.model_urls[model][1] + + 
print('Downloading', url) + response = urllib.request.urlopen(url) + with open(file, 'wb') as handle: + handle.write(response.read()) + + print('Unzipping', file) + zip = zipfile.ZipFile(file, 'r') + zip.extractall(self.save_path) + zip.close() + + sha_dict = self.model_sha[model] + for extracted_file in sha_dict: + sha = sha_dict[extracted_file] + if sha != self.sha256sum(file[:-4] + '/' + extracted_file): + found_mismatch_sha = True + print('SHA256sum does not match on file:', extracted_file, 'from download url:', url) + else: + print(file[:-4] + '/' + extracted_file, '\t', 'verified') + + if not found_mismatch_sha: + print("All downloads pass sha256sum verification.") + + def serialize(self): + pass + + def deserialize(self): + pass + + def listAvailableWeights(self): + print("Available Weight Datasets") + for item in self.model_urls: + print(item) + + def listLocallyStoredWeights(self): + pass + diff --git a/modelzoo/ELECTRA/data/MRPCDownloader.py b/modelzoo/ELECTRA/data/MRPCDownloader.py new file mode 100644 index 00000000..42dd4227 --- /dev/null +++ b/modelzoo/ELECTRA/data/MRPCDownloader.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import sys + +class MRPCDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/mrpc' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py + self.download_urls = { + 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc' : 'mrpc_dev_ids.tsv' + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py b/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py new file mode 100644 index 00000000..13c9a320 --- /dev/null +++ b/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py @@ -0,0 +1,27 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +class NVIDIAPretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/nvidia_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + pass + + + def download(self): + assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/SquadDownloader.py b/modelzoo/ELECTRA/data/SquadDownloader.py new file mode 100644 index 00000000..6d64ffc6 --- /dev/null +++ b/modelzoo/ELECTRA/data/SquadDownloader.py @@ -0,0 +1,54 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import sys + +class SquadDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/squad' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + if not os.path.exists(self.save_path + '/v1.1'): + os.makedirs(self.save_path + '/v1.1') + + if not os.path.exists(self.save_path + '/v2.0'): + os.makedirs(self.save_path + '/v2.0') + + self.download_urls = { + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json', + 'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json', + 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py', + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/ELECTRA/data/TextSharding.py b/modelzoo/ELECTRA/data/TextSharding.py new file mode 100644 index 00000000..0753e742 --- /dev/null +++ b/modelzoo/ELECTRA/data/TextSharding.py @@ -0,0 +1,327 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from itertools import islice
+
+import multiprocessing
+import statistics
+
+class Sharding:
+    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
+        assert len(input_files) > 0, 'The input file list must contain at least one file.'
+        assert n_training_shards > 0, 'There must be at least one output shard.'
+        assert n_test_shards > 0, 'There must be at least one output shard.'
+
+        self.n_training_shards = n_training_shards
+        self.n_test_shards = n_test_shards
+        self.fraction_test_set = fraction_test_set
+
+        self.input_files = input_files
+
+        self.output_name_prefix = output_name_prefix
+        self.output_training_identifier = '_training'
+        self.output_test_identifier = '_test'
+        self.output_file_extension = '.txt'
+
+        self.articles = {}               # key: integer identifier, value: list of articles
+        self.sentences = {}              # key: integer identifier, value: list of sentences
+        self.output_training_files = {}  # key: filename, value: list of articles to go into file
+        self.output_test_files = {}      # key: filename, value: list of articles to go into file
+
+        self.init_output_files()
+
+
+    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
+    def load_articles(self):
+        print('Start: Loading Articles')
+
+        global_article_count = 0
+        for input_file in self.input_files:
+            print('input file:', input_file)
+            with open(input_file, mode='r', newline='\n') as f:
+                for i, line in enumerate(f):
+                    if line.strip():
+                        self.articles[global_article_count] = line.rstrip()
+                        global_article_count += 1
+
+        print('End: Loading Articles: There are', len(self.articles), 'articles.')
+
+
+    def segment_articles_into_sentences(self, segmenter):
+        print('Start: Sentence Segmentation')
+        if len(self.articles) == 0:
+            self.load_articles()
+
+        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'
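+        # Note: chunks() below defaults to size=len(self.articles), so it yields the whole
+        # article dictionary as a single chunk. Only the 'serial' branch runs because
+        # use_multiprocessing is hard-coded to 'serial'; the 'manager' and 'queue'
+        # multiprocessing paths are work in progress and not selected.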
+ + # TODO: WIP: multiprocessing (create independent ranges and spawn processes) + use_multiprocessing = 'serial' + + def chunks(data, size=len(self.articles)): + it = iter(data) + for i in range(0, len(data), size): + yield {k: data[k] for k in islice(it, size)} + + if use_multiprocessing == 'manager': + manager = multiprocessing.Manager() + return_dict = manager.dict() + jobs = [] + n_processes = 7 # in addition to the main process, total = n_proc+1 + + def work(articles, return_dict): + sentences = {} + for i, article in enumerate(articles): + sentences[i] = segmenter.segment_string(articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + return_dict.update(sentences) + + for item in chunks(self.articles, len(self.articles)): + p = multiprocessing.Process(target=work, args=(item, return_dict)) + + # Busy wait + while len(jobs) >= n_processes: + pass + + jobs.append(p) + p.start() + + for proc in jobs: + proc.join() + + elif use_multiprocessing == 'queue': + work_queue = multiprocessing.Queue() + jobs = [] + + for item in chunks(self.articles, len(self.articles)): + pass + + else: # serial option + for i, article in enumerate(self.articles): + self.sentences[i] = segmenter.segment_string(self.articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + print('End: Sentence Segmentation') + + + def init_output_files(self): + print('Start: Init Output Files') + assert len(self.output_training_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + assert len(self.output_test_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + + for i in range(self.n_training_shards): + name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension + self.output_training_files[name] = [] + + for i in range(self.n_test_shards): + name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension + self.output_test_files[name] = [] + + print('End: Init Output Files') + + + def get_sentences_per_shard(self, shard): + result = 0 + for article_id in shard: + result += len(self.sentences[article_id]) + + return result + + + def distribute_articles_over_shards(self): + print('Start: Distribute Articles Over Shards') + assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.' 
+ + # Create dictionary with - key: sentence count per article, value: article id number + sentence_counts = defaultdict(lambda: []) + + max_sentences = 0 + total_sentences = 0 + + for article_id in self.sentences: + current_length = len(self.sentences[article_id]) + sentence_counts[current_length].append(article_id) + max_sentences = max(max_sentences, current_length) + total_sentences += current_length + + n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences) + nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards + nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards + + consumed_article_set = set({}) + unused_article_set = set(self.articles.keys()) + + # Make first pass and add one article worth of lines per file + for file in self.output_training_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard: + nominal_sentences_per_training_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per training shard.') + + for file in self.output_test_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard: + nominal_sentences_per_test_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per test shard.') + + training_counts = [] + test_counts = [] + + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + # Make subsequent passes over files to find articles to add without going over limit + history_remaining = [] + n_history_remaining = 4 + + while len(consumed_article_set) < len(self.articles): + for fidx, file in enumerate(self.output_training_files): + nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or training_counts[fidx] > training_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = 
sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + for fidx, file in enumerate(self.output_test_files): + nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or test_counts[fidx] > test_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed + if len(history_remaining) == n_history_remaining: + history_remaining.pop(0) + history_remaining.append(len(unused_article_set)) + + history_same = True + for i in range(1, len(history_remaining)): + history_same = history_same and (history_remaining[i-1] == history_remaining[i]) + + if history_same: + nominal_sentences_per_training_shard += 1 + # nominal_sentences_per_test_shard += 1 + + training_counts = [] + test_counts = [] + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + print('Distributing data over shards:', len(unused_article_set), 'articles remaining.') + + + if len(unused_article_set) != 0: + print('Warning: Some articles did not make it into output files.') + + + for shard in self.output_training_files: + print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard])) + + print('End: Distribute Articles Over Shards') + + + def write_shards_to_disk(self): + print('Start: Write Shards to Disk') + for shard in self.output_training_files: + self.write_single_shard(shard, self.output_training_files[shard]) + + for shard in self.output_test_files: + self.write_single_shard(shard, self.output_test_files[shard]) + + print('End: Write Shards to Disk') + + + def write_single_shard(self, shard_name, shard): + with open(shard_name, mode='w', newline='\n') as f: + for article_id in shard: + for line in self.sentences[article_id]: + f.write(line + '\n') + + f.write('\n') # Line break between articles + + +import nltk + +nltk.download('punkt') + +class NLTKSegmenter: + def __init(self): + pass + + def segment_string(self, article): + return nltk.tokenize.sent_tokenize(article) + diff --git a/modelzoo/ELECTRA/data/WikiDownloader.py b/modelzoo/ELECTRA/data/WikiDownloader.py new file mode 100644 index 00000000..505ec76c --- /dev/null +++ 
b/modelzoo/ELECTRA/data/WikiDownloader.py @@ -0,0 +1,57 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import subprocess +import sys + +class WikiDownloader: + def __init__(self, language, save_path): + self.save_path = save_path + '/wikicorpus_' + language + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.language = language + self.download_urls = { + 'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2', + 'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2' + } + + self.output_files = { + 'en' : 'wikicorpus_en.xml.bz2', + 'zh' : 'wikicorpus_zh.xml.bz2' + } + + + def download(self): + if self.language in self.download_urls: + url = self.download_urls[self.language] + filename = self.output_files[self.language] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + filename): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + filename, "wb") as handle: + handle.write(response.read()) + + # Always unzipping since this is relatively fast and will overwrite + print('Unzipping:', self.output_files[self.language]) + subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True) + + else: + assert False, 'WikiDownloader not implemented for this language yet.' \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py b/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py new file mode 100644 index 00000000..9d356b13 --- /dev/null +++ b/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
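The formatter added below consumes the directory tree produced by WikiExtractor, in which each extracted shard wraps every article in <doc id=...> ... </doc> markers. The following sketch is illustrative only: the sample text and temporary paths are invented, and WikicorpusTextFormatting (defined just below) is assumed to be in scope.

# Illustrative only: build a tiny WikiExtractor-style shard and run merge() over it.
import os
import tempfile

tmp = tempfile.mkdtemp()
os.makedirs(os.path.join(tmp, 'AA'))
with open(os.path.join(tmp, 'AA', 'wiki_00'), mode='w') as f:
    f.write('<doc id="1" url="https://en.wikipedia.org/wiki?curid=1" title="Example">\n'
            'Example\n'
            'First sentence of the article. Second sentence of the article.\n'
            '</doc>\n')

out_file = os.path.join(tmp, 'wikicorpus_en_one_article_per_line.txt')
formatter = WikicorpusTextFormatting(tmp, out_file, recursive=True)
formatter.merge()

with open(out_file) as f:
    # One line per article (the repeated title line is dropped via article_lines[1:]),
    # followed by a blank separator line.
    print(f.read())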
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)
\ No newline at end of file
diff --git a/modelzoo/ELECTRA/data/__init__.py b/modelzoo/ELECTRA/data/__init__.py
new file mode 100644
index 00000000..98386fd4
--- /dev/null
+++ b/modelzoo/ELECTRA/data/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/modelzoo/ELECTRA/data/create_datasets_from_start.sh b/modelzoo/ELECTRA/data/create_datasets_from_start.sh
new file mode 100755
index 00000000..58a72437
--- /dev/null
+++ b/modelzoo/ELECTRA/data/create_datasets_from_start.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+to_download=${1:-"wiki_only"}
+
+# Download
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/electra/data/dataPrep.py --action download --dataset bookscorpus
+fi
+python3 /workspace/electra/data/dataPrep.py --action download --dataset wikicorpus_en
+
+# Download SQuAD
+python3 /workspace/electra/data/dataPrep.py --action download --dataset squad
+
+# Properly format the text files
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset bookscorpus
+fi
+python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset wikicorpus_en
+
+if [ "$to_download" = "wiki_books" ] ; then
+    DATASET="books_wiki_en_corpus"
+else
+    DATASET="wikicorpus_en"
+    # Shard the text files
+fi
+
+# Shard the text files (group wiki+books then shard)
+python3 /workspace/electra/data/dataPrep.py --action sharding --dataset $DATASET --n_test_shards 2048 --n_training_shards 2048
+
+# Create TFRecord files Phase 1
+python3 /workspace/electra/data/dataPrep.py --action create_tfrecord_files --dataset $DATASET --max_seq_length 128 --n_test_shards 2048 --n_training_shards 2048 --vocab_file=vocab/vocab.txt --do_lower_case=1
+
+# Create TFRecord files Phase 2
+python3 /workspace/electra/data/dataPrep.py --action create_tfrecord_files --dataset $DATASET --max_seq_length 512 --n_test_shards 2048 --n_training_shards 2048 --vocab_file=vocab/vocab.txt --do_lower_case=1
diff --git a/modelzoo/ELECTRA/data/dataPrep.py b/modelzoo/ELECTRA/data/dataPrep.py
new file mode 100644
index 00000000..a029bc63
--- /dev/null
+++ b/modelzoo/ELECTRA/data/dataPrep.py
@@ -0,0 +1,312 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
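dataPrep.py, which starts below, drives the whole pipeline. Its sharding action reduces to roughly the following sequence of TextSharding calls; the input and output paths and the shard counts here are illustrative (the script defaults to 2048 training and 2048 test shards).

# Rough sketch of the sharding step, assuming a formatted one-article-per-line
# file already exists (the paths below are illustrative).
import os

import TextSharding

os.makedirs('sharded/wikicorpus_en', exist_ok=True)

segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(
    input_files=['formatted/wikicorpus_en_one_article_per_line.txt'],
    output_name_prefix='sharded/wikicorpus_en/wikicorpus_en',
    n_training_shards=4,   # dataPrep.py defaults to 2048
    n_test_shards=2,
    fraction_test_set=0.1,
)

sharding.load_articles()                              # one article per input line
sharding.segment_articles_into_sentences(segmenter)   # NLTK punkt sentence splitting
sharding.distribute_articles_over_shards()            # balance sentence counts across shards
sharding.write_shards_to_disk()                       # one sentence per line, blank line between articles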
+ +import BookscorpusTextFormatting +import Downloader +import TextSharding +import WikicorpusTextFormatting + +import argparse +import itertools +import multiprocessing +import os +import pprint +import subprocess + + +def main(args): + working_dir = os.environ['DATA_PREP_WORKING_DIR'] + + print('Working Directory:', working_dir) + print('Action:', args.action) + print('Dataset Name:', args.dataset) + + if args.input_files: + args.input_files = args.input_files.split(',') + + hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \ + + "_random_seed_" + str(args.random_seed) + + directory_structure = { + 'download' : working_dir + '/download', # Downloaded and decompressed + 'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) + 'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same + 'sharded' : working_dir + '/sharded_' + "training_shards_" + str(args.n_training_shards) + "_test_shards_" + str(args.n_test_shards) + "_fraction_" + str(args.fraction_test_set), + 'tfrecord' : working_dir + '/tfrecord'+ hdf5_tfrecord_folder_prefix, + 'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix + } + + print('\nDirectory Structure:') + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(directory_structure) + print('') + + if args.action == 'download': + if not os.path.exists(directory_structure['download']): + os.makedirs(directory_structure['download']) + + downloader = Downloader.Downloader(args.dataset, directory_structure['download']) + downloader.download() + + elif args.action == 'text_formatting': + assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'Cannot perform text_formatting on pretrained weights' + + if not os.path.exists(directory_structure['extracted']): + os.makedirs(directory_structure['extracted']) + + if not os.path.exists(directory_structure['formatted']): + os.makedirs(directory_structure['formatted']) + + if args.dataset == 'bookscorpus': + books_path = directory_structure['download'] + '/bookscorpus' + #books_path = directory_structure['download'] + output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' + books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) + books_formatter.merge() + + elif args.dataset == 'wikicorpus_en': + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + #wikiextractor_process.communicate() + + wiki_path = directory_structure['extracted'] + '/wikicorpus_en' + output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'wikicorpus_zh': + raise NotImplementedError( + 'wikicorpus_zh not fully supported at this time. 
The simplified/tradition Chinese data needs to be ' + 'translated and properly segmented still, and should work once this step is added.') + # if args.skip_wikiextractor == 0: + # path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + # wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + # print('WikiExtractor Command:', wikiextractor_command) + # wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + # #wikiextractor_process.communicate() + # + # wiki_path = directory_structure['extracted'] + '/wikicorpus_zh' + # output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt' + # wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + # wiki_formatter.merge() + # + # assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.' + + elif args.action == 'sharding': + # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces) + if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset: + if args.input_files is None: + if args.dataset == 'bookscorpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'] + elif args.dataset == 'wikicorpus_en': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'wikicorpus_zh': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'] + elif args.dataset == 'books_wiki_en_corpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + + output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset + + if not os.path.exists(directory_structure['sharded']): + os.makedirs(directory_structure['sharded']) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset) + + # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and + # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this. 
+ # Different languages (e.g., Chinese simplified/traditional) may require translation and + # other packages to be called from here -- just add a conditional branch for those extra steps + segmenter = TextSharding.NLTKSegmenter() + sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set) + + sharding.load_articles() + sharding.segment_articles_into_sentences(segmenter) + sharding.distribute_articles_over_shards() + sharding.write_shards_to_disk() + + for _dir in ['train', 'test']: + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/' + _dir): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/' + _dir) + absolute_dir = directory_structure['sharded'] + '/' + args.dataset + command = 'mv ' + absolute_dir + '/*' + _dir + '*.txt' + ' ' + absolute_dir + '/' + _dir + mv_process = subprocess.Popen(command, shell=True) + + mv_process.wait() + else: + assert False, 'Unsupported dataset for sharding' + + elif args.action == 'create_tfrecord_files': + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset) + if args.vocab_file is None: + args.vocab_file = os.path.join(working_dir, "vocab.txt") + + for _dir in ['train', 'test']: + electra_preprocessing_command = 'python /workspace/electra/build_pretraining_dataset.py' + electra_preprocessing_command += ' --corpus-dir=' + directory_structure['sharded'] + '/' + args.dataset + '/' + _dir + electra_preprocessing_command += ' --output-dir=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + _dir + electra_preprocessing_command += ' --vocab-file=' + args.vocab_file + electra_preprocessing_command += ' --do-lower-case' if args.do_lower_case else ' --no-lower-case' + electra_preprocessing_command += ' --max-seq-length=' + str(args.max_seq_length) + electra_preprocessing_command += ' --num-processes=8' + electra_preprocessing_command += ' --num-out-files=' + str(args.n_training_shards) if _dir == 'train' \ + else ' --num-out-files=' + str(args.n_test_shards) + electra_preprocessing_process = subprocess.Popen(electra_preprocessing_command, shell=True) + + electra_preprocessing_process.wait() + + + elif args.action == 'create_hdf5_files': + raise NotImplementedError + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Preprocessing Application for Everything BERT-related' + ) + + parser.add_argument( + '--action', + type=str, + help='Specify the action you want the app to take. 
e.g., generate vocab, segment, create tfrecords', + choices={ + 'download', # Download and verify mdf5/sha sums + 'text_formatting', # Convert into a file that contains one article/book per line + 'sharding', # Convert previous formatted text into shards containing one sentence per line + 'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info + 'create_hdf5_files' # Turn each shard into a HDF5 file with masking and next sentence prediction info + } + ) + + parser.add_argument( + '--dataset', + type=str, + help='Specify the dataset to perform --action on', + choices={ + 'bookscorpus', + 'wikicorpus_en', + 'wikicorpus_zh', + 'books_wiki_en_corpus', + 'google_pretrained_weights', + 'nvidia_pretrained_weights', + 'mrpc', + 'squad', + 'all' + } + ) + + parser.add_argument( + '--input_files', + type=str, + help='Specify the input files in a comma-separated list (no spaces)' + ) + + parser.add_argument( + '--n_training_shards', + type=int, + help='Specify the number of training shards to generate', + default=2048 + ) + + parser.add_argument( + '--n_test_shards', + type=int, + help='Specify the number of test shards to generate', + default=2048 + ) + + parser.add_argument( + '--fraction_test_set', + type=float, + help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)', + default=0.1 + ) + + parser.add_argument( + '--segmentation_method', + type=str, + help='Specify your choice of sentence segmentation', + choices={ + 'nltk' + }, + default='nltk' + ) + + parser.add_argument( + '--n_processes', + type=int, + help='Specify the max number of processes to allow at one time', + default=4 + ) + + parser.add_argument( + '--random_seed', + type=int, + help='Specify the base seed to use for any random number generation', + default=12345 + ) + + parser.add_argument( + '--dupe_factor', + type=int, + help='Specify the duplication factor', + default=5 + ) + + parser.add_argument( + '--masked_lm_prob', + type=float, + help='Specify the probability for masked lm', + default=0.15 + ) + + parser.add_argument( + '--max_seq_length', + type=int, + help='Specify the maximum sequence length', + default=512 + ) + + parser.add_argument( + '--do_lower_case', + type=int, + help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)', + default=0 + ) + + parser.add_argument( + '--vocab_file', + type=str, + help='Specify absolute path to vocab file to use)' + ) + + parser.add_argument( + '--skip_wikiextractor', + type=int, + help='Specify whether to skip wikiextractor step 0=False, 1=True', + default=0 + ) + + parser.add_argument( + '--interactive_json_config_generator', + type=str, + help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords' + ) + + args = parser.parse_args() + main(args) diff --git a/modelzoo/ELECTRA/data/glue/download_mrpc.sh b/modelzoo/ELECTRA/data/glue/download_mrpc.sh new file mode 100755 index 00000000..65f3446b --- /dev/null +++ b/modelzoo/ELECTRA/data/glue/download_mrpc.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Downloading MRPC data" + +wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py + +python download_glue_data.py --data_dir . --tasks MRPC diff --git a/modelzoo/ELECTRA/data/squad/squad_download.sh b/modelzoo/ELECTRA/data/squad/squad_download.sh new file mode 100755 index 00000000..7aa6f268 --- /dev/null +++ b/modelzoo/ELECTRA/data/squad/squad_download.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Downloading dataset for squad..." + +# Download SQuAD + +v1="v1.1" +mkdir $v1 +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json +wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py + +EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -' +EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -' +EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -' +CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum` +CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum` +CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum` + +v2="v2.0" +mkdir $v2 +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json +wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py + +EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -' +EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -' +EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -' + +CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum` +CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum` +CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum` + +echo "Squad data download done!" + +echo "Verifying Dataset...." + +if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then + echo "train-v1.1.json is corrupted! md5sum doesn't match" +fi + +if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then + echo "dev-v1.1.json is corrupted! md5sum doesn't match" +fi +if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then + echo "evaluate-v1.1.py is corrupted! md5sum doesn't match" +fi + + +if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then + echo "train-v2.0.json is corrupted! md5sum doesn't match" +fi +if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then + echo "dev-v2.0.json is corrupted! 
md5sum doesn't match" +fi +if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then + echo "evaluate-v2.0.py is corrupted! md5sum doesn't match" +fi + +echo "Complete!" diff --git a/modelzoo/ELECTRA/file_utils.py b/modelzoo/ELECTRA/file_utils.py new file mode 100644 index 00000000..da6a96e0 --- /dev/null +++ b/modelzoo/ELECTRA/file_utils.py @@ -0,0 +1,515 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" + +import fnmatch +import json +import logging +import os +import shutil +import sys +import tarfile +import tempfile +from contextlib import contextmanager +from functools import partial, wraps +from hashlib import sha256 +from typing import Optional +from urllib.parse import urlparse +from zipfile import ZipFile, is_zipfile + +import boto3 +import requests +from botocore.config import Config +from botocore.exceptions import ClientError +from filelock import FileLock +from tqdm.auto import tqdm + +# from examples import __version__ +__version__ = "0.1" + + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): + import torch + + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) + else: + logger.info("Disabling PyTorch because USE_TF is set") + _torch_available = False +except ImportError: + _torch_available = False # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + + if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): + import tensorflow as tf + + assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) + else: + logger.info("Disabling Tensorflow because USE_TORCH is set") + _tf_available = False +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name + +try: + from torch.hub import _get_torch_home + + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) + ) +default_cache_path = os.path.join(torch_cache_home, "transformers") + +try: + from pathlib import Path + + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) + ) +except (AttributeError, 
ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) + ) + +PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility + +WEIGHTS_NAME = "pytorch_model.bin" +TF2_WEIGHTS_NAME = "tf_model.h5" +TF_WEIGHTS_NAME = "model.ckpt" +CONFIG_NAME = "config.json" +MODEL_CARD_NAME = "modelcard.json" + + +MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] +DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + +S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" +CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" + + +def is_torch_available(): + return _torch_available + + +def is_tf_available(): + return _tf_available + + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_callable(*docstr): + def docstring_decorator(fn): + class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) + intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within + this function, one should call the :class:`Module` instance afterwards + instead of this since the former takes care of running the + pre and post processing steps while the latter silently ignores them. + """ + fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn + + return docstring_decorator + + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https", "s3") + + +def hf_bucket_url(identifier, postfix=None, cdn=False) -> str: + endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX + if postfix is None: + return "/".join((endpoint, identifier)) + else: + return "/".join((endpoint, identifier, postfix)) + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name + so that TF 2.0 can identify it as a HDF5 file + (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode("utf-8") + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode("utf-8") + etag_hash = sha256(etag_bytes) + filename += "." + etag_hash.hexdigest() + + if url.endswith(".h5"): + filename += ".h5" + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
+ """
+ if cache_dir is None:
+ cache_dir = TRANSFORMERS_CACHE
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+
+ cache_path = os.path.join(cache_dir, filename)
+ if not os.path.exists(cache_path):
+ raise EnvironmentError("file {} not found".format(cache_path))
+
+ meta_path = cache_path + ".json"
+ if not os.path.exists(meta_path):
+ raise EnvironmentError("file {} not found".format(meta_path))
+
+ with open(meta_path, encoding="utf-8") as meta_file:
+ metadata = json.load(meta_file)
+ url = metadata["url"]
+ etag = metadata["etag"]
+
+ return url, etag
+
+
+def cached_path(
+ url_or_filename,
+ cache_dir=None,
+ force_download=False,
+ proxies=None,
+ resume_download=False,
+ user_agent=None,
+ extract_compressed_file=False,
+ force_extract=False,
+ local_files_only=False,
+) -> Optional[str]:
+ """
+ Given something that might be a URL (or might be a local path),
+ determine which. If it's a URL, download the file and cache it, and
+ return the path to the cached file. If it's already a local path,
+ make sure the file exists and then return the path.
+ Args:
+ cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
+ force_download: if True, re-download the file even if it's already cached in the cache dir.
+ resume_download: if True, resume the download if an incompletely received file is found.
+ user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
+ extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
+ file into a folder alongside the archive.
+ force_extract: if True when extract_compressed_file is True and the archive was already extracted,
+ re-extract the archive and override the folder where it was extracted.
+
+ Return:
+ None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+ Local path (string) otherwise
+ """
+ if cache_dir is None:
+ cache_dir = TRANSFORMERS_CACHE
+ if isinstance(url_or_filename, Path):
+ url_or_filename = str(url_or_filename)
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+
+ if is_remote_url(url_or_filename):
+ # URL, so get it from the cache (downloading if necessary)
+ output_path = get_from_cache(
+ url_or_filename,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ user_agent=user_agent,
+ local_files_only=local_files_only,
+ )
+ elif os.path.exists(url_or_filename):
+ # File, and it exists.
+ output_path = url_or_filename
+ elif urlparse(url_or_filename).scheme == "":
+ # File, but it doesn't exist.
+ raise EnvironmentError("file {} not found".format(url_or_filename))
+ else:
+ # Something unknown
+ raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
+
+ if extract_compressed_file:
+ if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
+ return output_path
+
+ # Path where we extract compressed archives
+ # We avoid '.'
in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" + output_dir, output_file = os.path.split(output_path) + output_extract_dir_name = output_file.replace(".", "-") + "-extracted" + output_path_extracted = os.path.join(output_dir, output_extract_dir_name) + + if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: + return output_path_extracted + + # Prevent parallel extractions + lock_path = output_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted) + if is_zipfile(output_path): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + else: + raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) + + return output_path_extracted + + return output_path + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url, proxies=None): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file, proxies=None): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): + ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) + if is_torch_available(): + ua += "; torch/{}".format(torch.__version__) + if is_tf_available(): + ua += "; tensorflow/{}".format(tf.__version__) + if isinstance(user_agent, dict): + ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + headers = {"user-agent": ua} + if resume_size > 0: + headers["Range"] = "bytes=%d-" % (resume_size,) + response = requests.get(url, stream=True, proxies=proxies, headers=headers) + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get("Content-Length") + total = resume_size + int(content_length) if content_length is not None else None + progress = tqdm( + unit="B", + unit_scale=True, + total=total, + initial=resume_size, + desc="Downloading", + disable=bool(logger.getEffectiveLevel() == logging.NOTSET), + ) + for chunk in response.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + 
progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache( + url, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=10, + resume_download=False, + user_agent=None, + local_files_only=False, +) -> Optional[str]: + """ + Given a URL, look for the corresponding file in the local cache. + If it's not there, download it. Then return the path to the cached file. + + Return: + None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + Local path (string) otherwise + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + etag = None + if not local_files_only: + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url, proxies=proxies) + else: + try: + response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) + if response.status_code == 200: + etag = response.headers.get("ETag") + except (EnvironmentError, requests.exceptions.Timeout): + # etag is already None + pass + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. + # try to get the last downloaded one + if etag is None: + if os.path.exists(cache_path): + return cache_path + else: + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") + if not file.endswith(".json") and not file.endswith(".lock") + ] + if len(matching_files) > 0: + return os.path.join(cache_dir, matching_files[-1]) + else: + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise ValueError( + "Cannot find the requested files in the cached path and outgoing traffic has been" + " disabled. To enable model look-ups and downloads online, set 'local_files_only'" + " to False." + ) + return None + + # From now on, etag is not None. + if os.path.exists(cache_path) and not force_download: + return cache_path + + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + ".lock" + with FileLock(lock_path): + + if resume_download: + incomplete_path = cache_path + ".incomplete" + + @contextmanager + def _resumable_file_manager(): + with open(incomplete_path, "a+b") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
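+ # Note: temp_file_manager() creates the temporary file inside cache_dir (it is
+ # built with dir=cache_dir above, and the resumable variant writes to
+ # cache_path + ".incomplete"), so the os.replace() below is a rename within the
+ # same directory/filesystem; a reader of the cache sees either the previous
+ # entry or the fully written file, never a partially downloaded one.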
+ with temp_file_manager() as temp_file: + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + if resume_download: + logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') + s3_get(url, temp_file, proxies=proxies) + else: + http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) + + logger.info("storing %s in cache at %s", url, cache_path) + os.replace(temp_file.name, cache_path) + + logger.info("creating metadata file for %s", cache_path) + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) + + return cache_path diff --git a/modelzoo/ELECTRA/gpu_affinity.py b/modelzoo/ELECTRA/gpu_affinity.py new file mode 100644 index 00000000..68520734 --- /dev/null +++ b/modelzoo/ELECTRA/gpu_affinity.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + return [i for i, e in enumerate(affinity_list) if e != 0] + + +def set_affinity(gpu_id=None): + if gpu_id is None: + gpu_id = int(os.getenv('LOCAL_RANK', 0)) + + dev = device(gpu_id) + os.sched_setaffinity(0, dev.getCpuAffinity()) + + # list of ints representing the logical cores this process is now affinitied with + return os.sched_getaffinity(0) diff --git a/modelzoo/ELECTRA/images/total_loss.svg b/modelzoo/ELECTRA/images/total_loss.svg new file mode 100644 index 00000000..215868d7 --- /dev/null +++ b/modelzoo/ELECTRA/images/total_loss.svg @@ -0,0 +1 @@ +891020-1k01k2k3k4k5k6k7k8k9k10k11k \ No newline at end of file diff --git a/modelzoo/ELECTRA/modeling.py b/modelzoo/ELECTRA/modeling.py new file mode 100644 index 00000000..437decca --- /dev/null +++ b/modelzoo/ELECTRA/modeling.py @@ -0,0 +1,1084 @@ +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import tensorflow as tf + +from configuration import ElectraConfig +from file_utils import add_start_docstrings, add_start_docstrings_to_callable +from modeling_utils import ACT2FN, TFBertEncoder, TFBertPreTrainedModel +from modeling_utils import get_initializer, shape_list +from tokenization_utils import BatchEncoding +import pretrain_utils, collections + +logger = logging.getLogger(__name__) + + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/tf_model.h5", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/tf_model.h5", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/tf_model.h5", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/tf_model.h5", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/tf_model.h5", +} + + +class TFElectraEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.amp = config.amp + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. 
+ Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if self.amp: + embeddings = inputs_embeds + tf.cast(position_embeddings, tf.float16) + tf.cast(token_type_embeddings, tf.float16) + else: + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. 
+ """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + + x = tf.reshape(inputs, [-1, self.embedding_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense") + self.dense_prediction = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="dense_prediction") + self.config = config + + def call(self, discriminator_hidden_states, training=False): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + logits = tf.squeeze(self.dense_prediction(hidden_states), axis=-1) + + return logits + + +class TFElectraGeneratorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = tf.keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense") + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = ACT2FN["gelu"](hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +class TFElectraPreTrainedModel(TFBertPreTrainedModel): + + config_class = ElectraConfig + pretrained_model_archive_map = TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "electra" + + def get_extended_attention_mask(self, attention_mask, input_shape): + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
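+ # Worked illustration: for a sequence with attention_mask = [1, 1, 0], the
+ # line below computes (1.0 - [1., 1., 0.]) * -10000.0 = [0., 0., -10000.]
+ # (shape [batch_size, 1, 1, seq_length] after the newaxis indexing above),
+ # so attention paid to the masked position is effectively zero after softmax.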
+ + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + +class TFElectraMainLayer(TFElectraPreTrainedModel): + + config_class = ElectraConfig + + def __init__(self, config, shared_embeddings=False, input_embeddings=None, **kwargs): + super().__init__(config, **kwargs) + + if shared_embeddings and input_embeddings is not None: + self.embeddings = input_embeddings + else: + self.embeddings = TFElectraEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embeddings_project") + self.encoder = TFBertEncoder(config, name="encoder") + self.config = config + + def get_input_embeddings(self): + return self.embeddings + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + head_mask = self.get_head_mask(head_mask) + + hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training) + + return hidden_states + + +ELECTRA_START_DOCSTRING = r""" + This model is a `tf.keras.Model `__ sub-class. + Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having + all the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors + in the first positional argument : + + - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ElectraTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? 
<../glossary.html#attention-mask>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + training (:obj:`boolean`, `optional`, defaults to :obj:`False`): + Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them + (if set to :obj:`False`) for evaluation. + +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different." + "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class TFElectraModel(TFElectraPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.electra = TFElectraMainLayer(config, name="electra") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraModel + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraModel.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + """ + outputs = self.electra(inputs, **kwargs) + return outputs + + +@add_start_docstrings( + """ +Electra model with a binary classification head on top as used during pre-training for identifying generated +tokens. + +Even though both the discriminator and generator may be loaded into this model, the discriminator is +the only model of the two to have the correct classification head to be used for this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForPreTraining(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForPreTraining + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.discriminator_predictions(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +class TFElectraMaskedLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) + + def call(self, hidden_states, training=False): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings( + """ +Electra model with a language modeling head on top. + +Even though both the discriminator and generator may be loaded into this model, the generator is +the only model of the two to have been trained for the masked language modeling task.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForMaskedLM(TFElectraPreTrainedModel): + def __init__(self, config, shared_embeddings=False, input_embeddings=None, **kwargs): + super().__init__(config, **kwargs) + + self.vocab_size = config.vocab_size + self.electra = TFElectraMainLayer(config, + shared_embeddings=shared_embeddings, + input_embeddings=input_embeddings, + name="electra") + self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + + def get_input_embeddings(self): + return self.electra.embeddings + + def get_output_embeddings(self): + return self.generator_lm_head + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForMaskedLM + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') + model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + + generator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=training) + prediction_scores = self.generator_lm_head(prediction_scores, training=training) + output = (prediction_scores,) + output += generator_hidden_states[1:] + + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + +def get_generator_config(config, bert_config): + """Get model config for the generator network.""" + gen_config = ElectraConfig.from_dict(bert_config.to_dict()) + gen_config.hidden_size = int(round( + bert_config.hidden_size * config.generator_hidden_size)) + #To keep hidden size divisble by 64 - attention head size + if gen_config.hidden_size % 64 != 0: + gen_config.hidden_size += 64 - (gen_config.hidden_size % 64) + gen_config.num_hidden_layers = int(round( + bert_config.num_hidden_layers * config.generator_layers)) + gen_config.intermediate_size = 4 * gen_config.hidden_size + gen_config.num_attention_heads = max(1, gen_config.hidden_size // 64) + return gen_config + +class PretrainingModel(tf.keras.Model): + """Transformer pre-training using the replaced-token-detection task.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + # Set up model config + self._config = config + self.disc_config = ElectraConfig(vocab_size=config.vocab_size, + embedding_size=config.embedding_size, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=4*config.hidden_size, + hidden_act=config.act_func, + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, ) + self.disc_config.update({"amp": config.amp}) + + # Set up discriminator + self.discriminator = TFElectraForPreTraining(self.disc_config) + + # Set up generator + gen_config = get_generator_config(config, self.disc_config) + gen_config.update({"amp": config.amp}) + if config.electra_objective: + if config.shared_embeddings: + self.generator = TFElectraForMaskedLM( + gen_config, shared_embeddings=True, + input_embeddings=self.discriminator.get_input_embeddings()) + else: + self.generator = TFElectraForMaskedLM(gen_config) + else: + self.generator = TFElectraForMaskedLM(self.disc_config) + + def call(self, features, is_training): + config = self._config + + # Mask the input + masked_inputs = pretrain_utils.mask( + config, pretrain_utils.features_to_inputs(features), 
config.mask_prob) + + # Generator + if config.uniform_generator: + mlm_output = self._get_masked_lm_output(masked_inputs, None, is_training=is_training) + else: + mlm_output = self._get_masked_lm_output( + masked_inputs, self.generator, is_training=is_training) + fake_data = self._get_fake_data(masked_inputs, mlm_output.logits) + total_loss = config.gen_weight * mlm_output.loss + + # Discriminator + disc_output = None + if config.electra_objective: + disc_output = self._get_discriminator_output( + fake_data.inputs, self.discriminator, fake_data.is_fake_tokens, + is_training=is_training) + total_loss += config.disc_weight * disc_output.loss + + # Evaluation inputs + eval_fn_inputs = { + "input_ids": masked_inputs.input_ids, + "masked_lm_preds": mlm_output.preds, + "mlm_loss": mlm_output.per_example_loss, + "masked_lm_ids": masked_inputs.masked_lm_ids, + "masked_lm_weights": masked_inputs.masked_lm_weights, + "input_mask": masked_inputs.input_mask + } + if config.electra_objective: + eval_fn_inputs.update({ + "disc_loss": disc_output.per_example_loss, + "disc_labels": disc_output.labels, + "disc_probs": disc_output.probs, + "disc_preds": disc_output.preds, + "sampled_tokids": tf.argmax(fake_data.sampled_tokens, -1, + output_type=tf.int32) + }) + + return total_loss, eval_fn_inputs + + def _get_masked_lm_output(self, inputs, generator, is_training=False): + """Masked language modeling softmax layer.""" + masked_lm_weights = inputs.masked_lm_weights + + if self._config.uniform_generator: + logits = tf.zeros(self.disc_config.vocab_size) + logits_tiled = tf.zeros( + pretrain_utils.get_shape_list(inputs.masked_lm_ids) + + [self.disc_config.vocab_size]) + logits_tiled += tf.reshape(logits, [1, 1, self.disc_config.vocab_size]) + logits = logits_tiled + else: + outputs = generator( + input_ids=inputs.input_ids, + attention_mask=inputs.input_mask, + token_type_ids=inputs.segment_ids, + training=is_training) + logits = outputs[0] + logits = pretrain_utils.gather_positions( + logits, inputs.masked_lm_positions) + + oh_labels = tf.one_hot( + inputs.masked_lm_ids, depth=self.disc_config.vocab_size, + dtype=tf.float32) + + probs = tf.cast(tf.nn.softmax(logits), tf.float32) + log_probs = tf.cast(tf.nn.log_softmax(logits), tf.float32) + label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1) + + numerator = tf.reduce_sum(masked_lm_weights * label_log_probs) + denominator = tf.reduce_sum(masked_lm_weights) + 1e-6 + loss = numerator / denominator + preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32) + + MLMOutput = collections.namedtuple( + "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"]) + return MLMOutput( + logits=logits, probs=probs, per_example_loss=label_log_probs, + loss=loss, preds=preds) + + def _get_discriminator_output(self, inputs, discriminator, labels, is_training=False): + """Discriminator binary classifier.""" + + outputs = discriminator( + input_ids=inputs.input_ids, + attention_mask=inputs.input_mask, + token_type_ids=inputs.segment_ids, + training=is_training, + ) + logits = outputs[0] + weights = tf.cast(inputs.input_mask, tf.float32) + labelsf = tf.cast(labels, tf.float32) + logits = tf.cast(logits, tf.float32) + losses = tf.nn.sigmoid_cross_entropy_with_logits( + logits=logits, labels=labelsf) * weights + per_example_loss = (tf.reduce_sum(losses, axis=-1) / + (1e-6 + tf.reduce_sum(weights, axis=-1))) + loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights)) + probs = tf.nn.sigmoid(logits) + preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), 
tf.int32) + DiscOutput = collections.namedtuple( + "DiscOutput", ["loss", "per_example_loss", "probs", "preds", + "labels"]) + return DiscOutput( + loss=loss, per_example_loss=per_example_loss, probs=probs, + preds=preds, labels=labels, + ) + + def _get_fake_data(self, inputs, mlm_logits): + """Sample from the generator to create corrupted input.""" + inputs = pretrain_utils.unmask(inputs) + disallow = tf.one_hot( + inputs.masked_lm_ids, depth=self.disc_config.vocab_size, + dtype=tf.float32) if self._config.disallow_correct else None + sampled_tokens = tf.stop_gradient(pretrain_utils.sample_from_softmax( + mlm_logits / self._config.temperature, disallow=disallow)) + sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32) + updated_input_ids, masked = pretrain_utils.scatter_update( + inputs.input_ids, sampled_tokids, inputs.masked_lm_positions) + labels = masked * (1 - tf.cast( + tf.equal(updated_input_ids, inputs.input_ids), tf.int32)) + updated_inputs = pretrain_utils.get_updated_inputs( + inputs, input_ids=updated_input_ids) + FakedData = collections.namedtuple("FakedData", [ + "inputs", "is_fake_tokens", "sampled_tokens"]) + return FakedData(inputs=updated_inputs, is_fake_tokens=labels, + sampled_tokens=sampled_tokens) + + +@add_start_docstrings( + """ +Electra model with a token classification head on top. + +Both the discriminator and generator may be loaded into this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForTokenClassification(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier") + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForTokenClassification + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +class TFPoolerStartLogits(tf.keras.Model): + """ Compute SQuAD start_logits from sequence hidden states. """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="start_logit_pooler_dense" + ) + + def call(self, hidden_states, p_mask=None, next_layer_dtype=tf.float32): + """ Args: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` + invalid position mask such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + x = tf.squeeze(self.dense(hidden_states), axis=-1, + name="squeeze_start_logit_pooler") + + if p_mask is not None: + x = tf.cast(x, tf.float32) * (1 - p_mask) - 1e30 * p_mask + + return x + + +class TFPoolerEndLogits(tf.keras.Model): + """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense_0 = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), + name="end_logit_pooler_dense_0" + ) + + self.activation = tf.keras.layers.Activation('tanh') # nn.Tanh() + self.LayerNorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=config.layer_norm_eps, + name="end_logit_pooler_LayerNorm") + self.dense_1 = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="end_logit_pooler_dense_1" + ) + + def call(self, hidden_states, start_states=None, start_positions=None, p_mask=None, training=False, + next_layer_dtype=tf.float32): + """ Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + **start_states**: ``torch.LongTensor`` of shape identical to hidden_states + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. 
+ """ + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" + if start_positions is not None and training: + bsz, slen, hsz = hidden_states.shape + start_states = tf.gather(hidden_states, start_positions[:, None], axis=1, + batch_dims=1) # shape (bsz, 1, hsz) + start_states = tf.broadcast_to(start_states, (bsz, slen, hsz)) # shape (bsz, slen, hsz) + + x = self.dense_0(tf.concat([hidden_states, start_states], axis=-1)) + x = self.activation(x) + if training: + # since we are not doing beam search, add dimension with value=1. corresponds to dimension with top_k during inference - if not layernorm crashes + x = tf.expand_dims(x, axis=2) + x = self.LayerNorm(x) + + if training: + # undo the additional dimension added above + x = tf.squeeze(self.dense_1(x), axis=[-1, -2]) + else: + x = tf.squeeze(self.dense_1(x), axis=-1) + + if p_mask is not None: + x = tf.cast(x, tf.float32) * (1 - p_mask) - 1e30 * p_mask + + return x + + +class TFPoolerAnswerClass(tf.keras.Model): + """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense_0 = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), + name="pooler_answer_class_dense_0" + ) + + self.activation = tf.keras.layers.Activation('tanh') + self.dense_1 = tf.keras.layers.Dense( + 1, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), + name="pooler_answer_class_dense_1" + ) + + def call(self, hidden_states, start_states=None, start_positions=None, cls_index=None): + """ + Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. 
+ note(Original repo): + no dependency on end_feature so that we can obtain one single `cls_logits` + for each sample + """ + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" + if start_positions is not None: + start_states = tf.gather(hidden_states, start_positions[:, None], axis=1, + batch_dims=1) # shape (bsz, 1, hsz) + start_states = tf.squeeze(start_states, axis=1) # shape (bsz, hsz) + + if cls_index is not None: + cls_token_state = tf.gather(hidden_states, cls_index[:, None], axis=1, batch_dims=1) # shape (bsz, 1, hsz) + cls_token_state = tf.squeeze(cls_token_state, axis=1) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, 0, :] # shape (bsz, hsz) + + x = self.dense_0(tf.concat([start_states, cls_token_state], axis=-1)) + x = self.activation(x) + x = tf.squeeze(self.dense_1(x), axis=-1) + + return x + + +class TFElectraForQuestionAnswering(TFElectraPreTrainedModel): + def __init__(self, config, args): + super().__init__(config, args) + + self.start_n_top = args.beam_size # config.start_n_top + self.end_n_top = args.beam_size # config.end_n_top + self.joint_head = args.joint_head + self.v2 = args.version_2_with_negative + self.electra = TFElectraMainLayer(config, name="electra") + self.num_hidden_layers = config.num_hidden_layers + self.amp = config.amp + + ##old head + if not self.joint_head: + self.qa_outputs = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs") + else: + self.start_logits = TFPoolerStartLogits(config, name='start_logits') + self.end_logits = TFPoolerEndLogits(config, name='end_logits') + if self.v2: + self.answer_class = TFPoolerAnswerClass(config, name='answer_class') + + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + outputs = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = outputs[0] + + # Simple head model + if not self.joint_head: + logits = self.qa_outputs(discriminator_sequence_output) + [start_logits, end_logits] = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1, name="squeeze_start_logit") + end_logits = tf.squeeze(end_logits, axis=-1, name="squeeze_end_logit") + outputs = (start_logits, end_logits) + outputs + return outputs + + start_logits = self.start_logits(discriminator_sequence_output, p_mask=p_mask, + next_layer_dtype=self.end_logits.dense_0.dtype) + if training: # start_positions is not None and end_positions is not None: + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(discriminator_sequence_output, start_positions=start_positions, p_mask=p_mask, + training=training, + next_layer_dtype=tf.float16 if self.amp else tf.float32) + + if self.v2: # cls_index is not None:#cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(discriminator_sequence_output, start_positions=start_positions, + cls_index=cls_index) + + else: + cls_logits = None + + outputs = (start_logits, end_logits, cls_logits) + outputs + + else: + # during inference, compute the end logits based on beam search + 
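For the simple (non-joint) head used by ``TFElectraForQuestionAnswering`` when ``joint_head`` is off, a single ``Dense(2)`` projection yields a ``(batch, seq_len, 2)`` tensor that is split into per-token start and end logits. A minimal sketch of that split/squeeze, with a dummy tensor standing in for the projected discriminator output::

    import tensorflow as tf

    batch, seq_len = 2, 6
    # Stand-in for qa_outputs(discriminator_sequence_output); the last axis
    # holds a (start, end) score for every token.
    logits = tf.random.normal((batch, seq_len, 2))

    start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (batch, seq_len, 1) tensors
    start_logits = tf.squeeze(start_logits, axis=-1)         # (batch, seq_len)
    end_logits = tf.squeeze(end_logits, axis=-1)             # (batch, seq_len)

    print(start_logits.shape, end_logits.shape)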
bsz, slen, hsz = discriminator_sequence_output.shape + start_n_top = min(self.start_n_top, slen) + end_n_top = min(self.end_n_top, slen) + start_log_probs = tf.nn.log_softmax(start_logits, axis=-1, name="start_logit_softmax") # shape (bsz, slen) + + start_top_log_probs, start_top_index = tf.math.top_k(start_log_probs, k=start_n_top, + name="start_log_probs_top_k") + + start_states = tf.gather(discriminator_sequence_output, start_top_index, axis=1, + batch_dims=1) # shape (bsz, start_n_top, hsz) + start_states = tf.broadcast_to(tf.expand_dims(start_states, axis=1), + [bsz, slen, start_n_top, hsz]) # shape (bsz, slen, start_n_top, hsz) + + discriminator_sequence_output_expanded = tf.broadcast_to( + tf.expand_dims(discriminator_sequence_output, axis=2), + list(start_states.shape)) # shape (bsz, slen, start_n_top, hsz) + + p_mask = tf.expand_dims(p_mask, axis=-1) if p_mask is not None else None + end_logits = self.end_logits(discriminator_sequence_output_expanded, start_states=start_states, + p_mask=p_mask, next_layer_dtype=tf.float16 if self.amp else tf.float32) # self.answer_class.dense_0.dtype) + end_log_probs = tf.nn.log_softmax(end_logits, axis=1, + name="end_logit_softmax") # shape (bsz, slen, start_n_top) + + # need to transpose because tf.math.top_k works on default axis=-1 + end_log_probs = tf.transpose(end_log_probs, perm=[0, 2, 1]) + end_top_log_probs, end_top_index = tf.math.top_k( + end_log_probs, k=end_n_top) # shape (bsz, end_n_top, start_n_top).perm(0,2,1) + end_top_log_probs = tf.reshape(end_top_log_probs, ( + -1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top) + end_top_index = tf.reshape(end_top_index, + (-1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top) + if self.v2: # cls_index is not None: + start_p = tf.nn.softmax(start_logits, axis=-1, name="start_softmax") + start_states = tf.einsum( + "blh,bl->bh", discriminator_sequence_output, tf.cast(start_p, tf.float16) if self.amp else start_p + ) # get the representation of START as weighted sum of hidden states + # explicitly setting cls_index to None + cls_logits = self.answer_class( + discriminator_sequence_output, start_states=start_states, cls_index=None) + # one single `cls_logits` for each sample + else: + cls_logits = tf.fill([bsz], 0.0) + + outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + + # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits + return outputs diff --git a/modelzoo/ELECTRA/modeling_utils.py b/modelzoo/ELECTRA/modeling_utils.py new file mode 100644 index 00000000..bfbc4cf4 --- /dev/null +++ b/modelzoo/ELECTRA/modeling_utils.py @@ -0,0 +1,2843 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
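In the joint-head inference branch that ends above, the flattened ``end_top_log_probs`` / ``end_top_index`` tensors have shape ``(bsz, start_n_top * end_n_top)``, where flat position ``j`` belongs to start candidate ``j // end_n_top``. A purely illustrative sketch of unpacking that layout into candidate spans, using made-up top-k indices rather than real model output::

    import tensorflow as tf

    start_n_top, end_n_top = 2, 3
    # Dummy outputs with the same layout the head produces:
    # start_top_index: (bsz, start_n_top); end_top_index: (bsz, start_n_top * end_n_top).
    start_top_index = tf.constant([[7, 12]])
    end_top_index = tf.constant([[9, 10, 8, 14, 13, 15]])

    for j in range(start_n_top * end_n_top):
        i = j // end_n_top  # which start candidate this end candidate pairs with
        start = int(start_top_index[0, i])
        end = int(end_top_index[0, j])
        print("candidate span:", start, end)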
+"""TF general model utils.""" +import functools +import logging +import os + +import h5py +import numpy as np +import tensorflow as tf +from tensorflow.python.keras.saving import hdf5_format + +from configuration_utils import PretrainedConfig, BertConfig +from file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url +from file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable +from tokenization_utils import BatchEncoding +from utils import log + + +class TFModelUtilsMixin: + """ + A few utilities for `tf.keras.Model`s, to be used as a mixin. + """ + + def num_parameters(self, only_trainable: bool = False) -> int: + """ + Get number of (optionally, trainable) parameters in the model. + """ + if only_trainable: + return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables)) + else: + return self.count_params() + + +def keras_serializable(cls): + """ + Decorate a Keras Layer class to support Keras serialization. + + This is done by: + 1. adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at + serialization time + 2. wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and + convert it to a config object for the actual layer initializer + 3. registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does + not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model` + + :param cls: a tf.keras.layers.Layers subclass that accepts a `config` argument to its initializer (typically a + `TF*MainLayer` class in this project) + :return: the same class object, with modifications for Keras deserialization. + """ + initializer = cls.__init__ + + config_class = getattr(cls, "config_class", None) + if config_class is None: + raise AttributeError("Must set `config_class` to use @keras_serializable") + + @functools.wraps(initializer) + def wrapped_init(self, *args, **kwargs): + transformers_config = kwargs.pop("transformers_config", None) + config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.get("config", None) + if config is not None and transformers_config is not None: + raise ValueError("Must pass either `config` or `transformers_config`, not both") + elif config is not None: + # normal layer construction, call with unchanged args (config is already in there) + initializer(self, *args, **kwargs) + elif transformers_config is not None: + # Keras deserialization, convert dict to config + config = config_class.from_dict(transformers_config) + initializer(self, config, *args, **kwargs) + else: + raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)") + self._transformers_config = config + + cls.__init__ = wrapped_init + + if not hasattr(cls, "get_config"): + raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses") + if hasattr(cls.get_config, "_is_default"): + + def get_config(self): + cfg = super(cls, self).get_config() + cfg["transformers_config"] = self._transformers_config.to_dict() + return cfg + + cls.get_config = get_config + + cls._keras_serializable = True + if hasattr(tf.keras.utils, "register_keras_serializable"): + cls = tf.keras.utils.register_keras_serializable()(cls) + return cls + + +class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): + r""" Base class for all TF models. 
+ + :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. + - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. + - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: + + - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, + - ``path``: a path (string) to the TensorFlow checkpoint. + + - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. + """ + config_class = None + pretrained_model_archive_map = {} + base_model_prefix = "" + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(DUMMY_INPUTS)} + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + if not isinstance(config, PretrainedConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + # Save config in model + self.config = config + + def get_input_embeddings(self): + """ + Returns the model's input embeddings. + + Returns: + :obj:`tf.keras.layers.Layer`: + A torch module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + else: + raise NotImplementedError + + def get_output_embeddings(self): + """ + Returns the model's output embeddings. + + Returns: + :obj:`tf.keras.layers.Layer`: + A torch module mapping hidden states to vocabulary. + """ + return None # Overwrite for models with output embeddings + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Variable from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (`optional`) int + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. 
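The resize logic sketched in the commented-out body below keeps the first ``min(old, new)`` rows of the old matrix and leaves any newly added rows at a fresh initialization. Since the patch ships that body commented out, the following is only an illustrative TensorFlow version of the same idea with plain variables (an assumption about intent, not the method the patch provides)::

    import tensorflow as tf

    def resized_embeddings_sketch(old_embeddings, new_num_tokens):
        # Illustrative only: grow or shrink an embedding matrix, copying existing rows.
        old_num_tokens, embedding_dim = old_embeddings.shape
        if new_num_tokens is None or new_num_tokens == old_num_tokens:
            return old_embeddings
        # Fresh matrix; rows beyond the copied range keep their new initialization.
        new_values = tf.random.truncated_normal((new_num_tokens, embedding_dim), stddev=0.02)
        num_to_copy = min(old_num_tokens, new_num_tokens)
        new_values = tf.concat([old_embeddings[:num_to_copy], new_values[num_to_copy:]], axis=0)
        return tf.Variable(new_values)

    old = tf.Variable(tf.random.normal((10, 4)))
    print(resized_embeddings_sketch(old, 12).shape)  # (12, 4)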
+ Return: ``tf.Variable`` + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + # if new_num_tokens is None: + # return old_embeddings + + # old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + # if old_num_tokens == new_num_tokens: + # return old_embeddings + + # # Build new embeddings + # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + # new_embeddings.to(old_embeddings.weight.device) + + # # initialize all new embeddings (in particular added tokens) + # self._init_weights(new_embeddings) + + # # Copy token embeddings from the previous weights + # num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + # return new_embeddings + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. + Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + + new_num_tokens: (`optional`) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. + + Return: ``tf.Variable`` + Pointer to the input tokens Embeddings Module of the model + """ + raise NotImplementedError + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the base model. + + Arguments: + + heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). + """ + raise NotImplementedError + + def save_pretrained(self, save_directory): + """ Save a model and its configuration file to a directory, so that it + can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method. + """ + if os.path.isfile(save_directory): + log("Provided path ({}) should be a directory, not a file".format(save_directory)) + return + os.makedirs(save_directory, exist_ok=True) + + # Save configuration file + self.config.save_pretrained(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME) + self.save_weights(output_model_file) + + with h5py.File(output_model_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) + log(f"Model weights saved in {output_model_file}: {hdf5_layer_names}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. + + The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. + It is up to you to train those weights with a downstream fine-tuning task. + + The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
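``save_pretrained`` above writes the configuration file plus an HDF5 weights file under the library's predefined name into a directory, which is exactly the layout ``from_pretrained`` can load back. A round-trip sketch in the style of the Examples blocks in this file (for example purposes, not runnable without network access or a trained model)::

    # For example purposes. Not runnable.
    model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
    model.save_pretrained('./my_model_directory/')  # writes config.json and the TF 2.0 weights file
    reloaded = TFElectraForTokenClassification.from_pretrained('./my_model_directory/')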
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) one of: + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or + - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + from_pt: (`optional`) boolean, default False: + Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + # For example purposes. Not runnable. + model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) + + """ + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + from_pt = kwargs.pop("from_pt", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + **kwargs, + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_pt` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path + ) + ) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + archive_file = pretrained_model_name_or_path + ".index" + else: + archive_file = hf_bucket_url( + pretrained_model_name_or_path, postfix=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME) + ) + + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + ) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + log("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)) + else: + log( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ", ".join(cls.pretrained_model_archive_map.keys()), + archive_file, + ) + ) + raise e + if resolved_archive_file == archive_file: + log("loading weights file {}".format(archive_file)) + else: + log("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) + else: + resolved_archive_file = None + + # Instantiate model. + model = cls(config, *model_args, **model_kwargs) + + if from_pt: + # Load from a PyTorch checkpoint + raise NotImplementedError + # return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True) + + model(model.dummy_inputs, training=False) # build the network with dummy inputs + + assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) + # 'by_name' allow us to do transfer learning by skipping/adding layers + # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 + try: + model.load_weights(resolved_archive_file, by_name=True) + except OSError: + raise OSError( + "Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " + ) + + model(model.dummy_inputs, training=False) # Make sure restore ops are run + + # Check if the models are the same to output loading information + with h5py.File(resolved_archive_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) + model_layer_names = set(layer.name for layer in model.layers) + missing_keys = list(model_layer_names - hdf5_layer_names) + unexpected_keys = list(hdf5_layer_names - model_layer_names) + error_msgs = [] + + if len(unexpected_keys) > 0: + log( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " + f"initializing {model.__class__.__name__}: {unexpected_keys}\n" + ) + else: + log(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + log( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized: {missing_keys}\n" + ) + else: + log( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" + f"If your task is similar to the task the model of the ckeckpoint was trained on, " + f"you can already use {model.__class__.__name__} for predictions without further training." 
+ ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + if output_loading_info: + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} + return model, loading_info + + return model + + def prepare_inputs_for_generation(self, inputs, **kwargs): + return {"inputs": inputs} + + def _do_output_past(self, outputs): + has_output_past = hasattr(self.config, "output_past") and self.config.output_past + has_mem_len = hasattr(self.config, "mem_len") and self.config.mem_len + + if has_output_past and not has_mem_len and len(outputs) > 1: + return True + elif has_mem_len and self.config.mem_len > 0 and len(outputs) > 1: + return True + + return False + + def generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + ): + r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling + and beam-search. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between 1 and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + bos_token_id: (`optional`) int + Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. + + pad_token_id: (`optional`) int + Pad token. Defaults to pad_token_id as defined in the models config. 
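When ``output_loading_info=True`` is passed, ``from_pretrained`` above additionally returns the dictionary of ``missing_keys``, ``unexpected_keys`` and ``error_msgs`` collected while matching checkpoint layers to model layers. A sketch of inspecting it, mirroring the earlier Examples blocks (for example purposes, not runnable)::

    # For example purposes. Not runnable.
    model, loading_info = TFElectraForTokenClassification.from_pretrained(
        'google/electra-small-discriminator', output_loading_info=True
    )
    print(loading_info['missing_keys'])     # layers newly initialized for this head
    print(loading_info['unexpected_keys'])  # checkpoint layers this model did not use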
+ + eos_token_id: (`optional`) int + EOS token. Defaults to eos_token_id as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + Return: + + output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. 
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = shape_list(input_ids)[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." 
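Every generation hyperparameter in the block above is resolved with the same ``value if value is not None else config default`` pattern, so explicit call arguments win and anything omitted falls back to the model configuration. A tiny stand-alone illustration of that pattern (the ``_Cfg`` class is a made-up stand-in, not part of the patch)::

    # Illustrative only: caller arguments override, None falls back to the config.
    class _Cfg:
        max_length = 20

    config = _Cfg()

    def resolve(value, default):
        return value if value is not None else default

    print(resolve(None, config.max_length))  # 20 -> config default used
    print(resolve(64, config.max_length))    # 64 -> explicit argument wins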
+ assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." + assert temperature > 0, "`temperature` should be strictely positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictely positive." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictely positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = tf.fill((batch_size, 1), bos_token_id) + else: + assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + elif attention_mask is None: + attention_mask = tf.ones_like(input_ids) + + if pad_token_id is None and eos_token_id is not None: + log( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) + ) + pad_token_id = eos_token_id + + # current position and vocab size + cur_len = shape_list(input_ids)[1] + vocab_size = self.config.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + input_ids_len = shape_list(input_ids)[-1] + input_ids = tf.broadcast_to( + tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + attention_mask = tf.broadcast_to( + tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + input_ids = tf.reshape( + input_ids, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = tf.reshape( + attention_mask, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_outputs = encoder(input_ids, attention_mask=attention_mask) + + # create empty decoder_input_ids + input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id + cur_len = 1 + + else: + encoder_outputs = None + cur_len = shape_list(input_ids)[-1] + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + 
top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + vocab_size, + encoder_outputs, + attention_mask, + ): + """ Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + + # length of generated sentences / unfinished sentences + unfinished_sents = tf.ones_like(input_ids[:, 0]) + sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, past=past, attention_mask=attention_mask) + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) + + next_token_logits = 
set_tensor_by_indices_to_value( + next_token_logits, eos_token_indices_mask, -float("inf") + ) + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + # Sample + next_token = tf.squeeze( + tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 + ) + else: + # Greedy decoding + next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) + else: + tokens_to_add = next_token + + input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sents, tf.cast(eos_in_sents, tf.int32) + ) + sent_lengths = ( + sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + + cur_len * is_sents_unfinished_and_token_to_add_is_eos + ) + + # unfinished_sents is set to zero if eos in sentence + unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when there is a in each sentence, or if we exceed the maximul length + if tf.math.reduce_max(unfinished_sents) == 0: + break + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + cur_len = cur_len + 1 + + # if there are different sentences lengths in the batch, some batches have to be padded + min_sent_length = tf.math.reduce_min(sent_lengths) + max_sent_length = tf.math.reduce_max(sent_lengths) + if min_sent_length != max_sent_length: + assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" + # finished sents are filled with pad_token + padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id + + # create length masks for tf.where operation + broad_casted_sent_lengths = tf.broadcast_to( + tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] + ) + broad_casted_range = tf.transpose( + tf.broadcast_to(tf.expand_dims(tf.range(max_length), -1), [max_length, batch_size]) + ) + + decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) + else: + decoded = input_ids + + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + decoder_start_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + ): + """ Generate sequences for each example with beam search. 
+ """ + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) + beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) + beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) + else: + beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) + + beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) + + # cache compute states + past = encoder_outputs + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, past=past, attention_mask=attention_mask) + outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + # calculate log softmax score + scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + num_batch_hypotheses = batch_size * num_beams + + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) + + scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + num_batch_hypotheses = batch_size * num_beams + banned_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in 
range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + assert shape_list(scores) == [batch_size * num_beams, vocab_size] + + if do_sample: + _scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # Top-p/top-k filtering + _scores = tf_top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) + + next_tokens = tf.random.categorical( + _scores, dtype=tf.int32, num_samples=2 * num_beams + ) # (batch_size, 2 * num_beams) + # Compute next scores + next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) + + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) + next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + else: + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + next_scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = tf.reshape( + next_scores, (batch_size, num_beams * vocab_size) + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) + + assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format(num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + continue + + # next sentence beam content + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() + ) + else: + # add next predicted token if it is not eos_token + next_sent_beam.append((beam_token_score, token_id, 
effective_beam_id))
+
+                    # the beam for next step is full
+                    if len(next_sent_beam) == num_beams:
+                        break
+
+                # Check if we're done so that we can save a pad step if all(done)
+                done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
+                    tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len=cur_len
+                )
+
+                # update next beam content
+                assert len(next_sent_beam) == num_beams, "Beam should always be full"
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == num_beams * (batch_idx + 1)
+
+            # stop when we are done with each sentence
+            if all(done):
+                break
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * num_beams
+            beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32)
+            beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32)
+            beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32)
+
+            # re-order batch
+            input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx])
+            input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1)
+            # re-order internal states
+            if past is not None:
+                past = self._reorder_cache(past, beam_idx)
+
+            # extend attention_mask for new generated input if only decoder
+            if self.config.is_encoder_decoder is False:
+                attention_mask = tf.concat(
+                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+                )
+
+            # update current length
+            cur_len = cur_len + 1
+
+        # finalize all open beam hypotheses and add them to generated hypotheses
+        for batch_idx in range(batch_size):
+            # Add all open beam hypothesis to generated_hyps
+            if done[batch_idx]:
+                continue
+            # test that beam scores match previously calculated scores if not eos and batch_idx not done
+            if eos_token_id is not None and all(
+                (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx]
+            ):
+                assert tf.reduce_all(
+                    next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
+                ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
+                    next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
+                )
+
+            # need to add best num_beams hypotheses to generated hyps
+            for beam_id in range(num_beams):
+                effective_beam_id = batch_idx * num_beams + beam_id
+                final_score = beam_scores[effective_beam_id].numpy().item()
+                final_tokens = input_ids[effective_beam_id]
+                generated_hyps[batch_idx].add(final_tokens, final_score)
+
+        # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
+        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
+        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
+
+        # select the best hypotheses
+        sent_lengths_list = []
+        best = []
+
+        # retrieve best hypotheses
+        for i, hypotheses in enumerate(generated_hyps):
+            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
+            for j in range(output_num_return_sequences_per_batch):
+                best_hyp = sorted_hyps.pop()[1]
+                sent_lengths_list.append(len(best_hyp))
+                best.append(best_hyp)
+        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(
+            output_batch_size, len(best)
+        )
+
+        sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
+
+        # shorter batches are filled with pad_token
+        if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
+            assert pad_token_id is not None, "`pad_token_id` has to be defined"
+            sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
+            decoded_list = []
+
+            # fill with hypothesis and eos_token_id if necessary
+            for i, hypo in enumerate(best):
+                assert sent_lengths[i] == shape_list(hypo)[0]
+                # if sent_length is max_len do not pad
+                if sent_lengths[i] == sent_max_len:
+                    decoded_slice = hypo
+                else:
+                    # else pad to sent_max_len
+                    num_pad_tokens = sent_max_len - sent_lengths[i]
+                    padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32)
+                    decoded_slice = tf.concat([hypo, padding], axis=-1)
+
+                    # finish sentence with EOS token
+                    if sent_lengths[i] < max_length:
+                        decoded_slice = tf.where(
+                            tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i],
+                            eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32),
+                            decoded_slice,
+                        )
+                # add to list
+                decoded_list.append(decoded_slice)
+
+            decoded = tf.stack(decoded_list)
+        else:
+            # none of the hypotheses have an eos_token
+            assert all(len(hypo) == max_length for hypo in best)
+            decoded = tf.stack(best)
+
+        return decoded
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        reordered_past = []
+        for layer_past in past:
+            # get the correct batch idx from layer past batch dim
+            # batch dim of `past` and `mems` is at 2nd position
+            reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[:, i], 1)) for i in beam_idx]
+            reordered_layer_past = tf.concat(reordered_layer_past, axis=1)
+            # check that shape matches
+            assert shape_list(reordered_layer_past) == shape_list(layer_past)
+            reordered_past.append(reordered_layer_past)
+        past = tuple(reordered_past)
+        return past
+
+
+def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty):
+    # create logit penalties for already seen input_ids
+    token_penalties = np.ones(shape_list(logits))
+    prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
+    for i, prev_input_id in enumerate(prev_input_ids):
+        logit_penalized = logits[i].numpy()[prev_input_id]
+        logit_penalties = np.zeros(logit_penalized.shape)
+        # if previous logit score is < 0 then multiply repetition penalty else divide
+        logit_penalties[logit_penalized < 0] = repetition_penalty
+        logit_penalties[logit_penalized > 0] = 1 / repetition_penalty
+        np.put(token_penalties[i], prev_input_id, logit_penalties)
+    return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
+
+
+def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len):
+    # Copied from fairseq for no_repeat_ngram in beam_search
+    if cur_len + 1 < no_repeat_ngram_size:
+        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+        return [[] for _ in range(num_hypos)]
+    generated_ngrams = [{} for _ in range(num_hypos)]
+    for idx in range(num_hypos):
+        gen_tokens = prev_input_ids[idx].numpy().tolist()
+        generated_ngram = generated_ngrams[idx]
+        for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
+            prev_ngram_tuple = tuple(ngram[:-1])
+            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
+
+    def _get_generated_ngrams(hypo_idx):
+        # Before decoding the next token, prevent decoding of ngrams that have already appeared
+        start_idx = cur_len + 1 - no_repeat_ngram_size
+        ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
+        return generated_ngrams[hypo_idx].get(ngram_idx, [])
+
+    banned_tokens = 
[_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + logits_shape = shape_list(logits) + + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + + if top_p < 1.0: + sorted_indices = tf.argsort(logits, direction="DESCENDING") + sorted_logits = tf.gather( + logits, sorted_indices, axis=-1, batch_dims=1 + ) # expects logits to be of dim (batch_size, vocab_size) + + cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove = tf.concat( + [ + tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), + sorted_indices_to_remove[:, min_tokens_to_keep:], + ], + -1, + ) + + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) + sorted_indices_to_remove = tf.concat( + [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, + ) + # scatter sorted tensors to original indexing + indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + return logits + + +def scatter_values_on_batch_indices(values, batch_indices): + shape = shape_list(batch_indices) + # broadcast batch dim to shape 
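+    # e.g. for batch_indices of shape [2, 3] the broadcast below yields
+    # [[0, 0, 0], [1, 1, 1]], reshaped to [[0, 0, 0, 1, 1, 1]], so that each value
+    # can be paired with its (batch, position) coordinate for tf.scatter_nd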
+ broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) + # transform batch_indices to pair_indices + pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) + # scatter values to pair indices + return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) + + +def set_tensor_by_indices_to_value(tensor, indices, value): + # create value_tensor since tensor value assignment is not possible in TF + value_tensor = tf.zeros_like(tensor) + value + return tf.where(indices, value_tensor, tensor) + + +class BeamHypotheses(object): + def __init__(self, num_beams, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len=None): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + if cur_len is None: + cur_len = self.max_length + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class TFConv1D(tf.keras.layers.Layer): + def __init__(self, nf, nx, initializer_range=0.02, **kwargs): + """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) + Basically works like a Linear layer but the weights are transposed + """ + super().__init__(**kwargs) + self.nf = nf + self.nx = nx + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + ) + self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) + + def call(self, x): + bz, sl = shape_list(x)[:2] + + x = tf.reshape(x, [-1, self.nx]) + x = tf.matmul(x, self.weight) + self.bias + + x = tf.reshape(x, [bz, sl, self.nf]) + + return x + + +class TFSharedEmbeddings(tf.keras.layers.Layer): + """Construct shared token embeddings. 
+ """ + + def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + + def build(self, input_shape): + """Build shared token embedding layer + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + self.weight = self.add_weight( + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding"): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, input_ids): + """Applies embedding based on inputs tensor.""" + return tf.gather(self.weight, input_ids) + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [..., hidden_size] + Returns: + float32 tensor with shape [..., vocab_size]. + """ + first_dims = shape_list(inputs)[:-1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.weight, transpose_b=True) + + return tf.reshape(logits, first_dims + [self.vocab_size]) + + +class TFSequenceSummary(tf.keras.layers.Layer): + r""" Compute a single vector summary of a sequence hidden states according to various possibilities: + Args of the config class: + summary_type: + - 'last' => [default] take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj: Add a projection after the vector extraction + summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. + summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default + summary_first_dropout: Add a dropout before the projection and activation + summary_last_dropout: Add a dropout after the projection and activation + """ + + def __init__(self, config, initializer_range=0.02, **kwargs): + super().__init__(**kwargs) + + self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. 
https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj + if self.has_summary: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = tf.keras.layers.Dense( + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + ) + + self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" + if self.has_activation: + self.activation = tf.keras.activations.tanh + + self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 + if self.has_first_dropout: + self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) + + self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 + if self.has_last_dropout: + self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) + + def call(self, inputs, training=False): + """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. + cls_index: [optional] position of the classification token if summary_type == 'cls_index', + shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. + if summary_type == 'cls_index' and cls_index is None: + we take the last token of the sequence as classification token + """ + if not isinstance(inputs, (dict, tuple, list)): + hidden_states = inputs + cls_index = None + elif isinstance(inputs, (tuple, list)): + hidden_states = inputs[0] + cls_index = inputs[1] if len(inputs) > 1 else None + assert len(inputs) <= 2, "Too many inputs." + else: + hidden_states = inputs.get("hidden_states") + cls_index = inputs.get("cls_index", None) + + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = tf.reduce_mean(hidden_states, axis=1) + elif self.summary_type == "cls_index": + hidden_shape = shape_list(hidden_states) # e.g. 
[batch, num choices, seq length, hidden dims] + if cls_index is None: + cls_index = tf.fill( + hidden_shape[:-2], hidden_shape[-2] - 1 + ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_shape = shape_list(cls_index) + if len(cls_shape) <= len(hidden_shape) - 2: + cls_index = cls_index[..., tf.newaxis] + # else: + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) + output = tf.squeeze( + output, axis=len(hidden_shape) - 2 + ) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + if self.has_first_dropout: + output = self.first_dropout(output, training=training) + + if self.has_summary: + output = self.summary(output) + + if self.has_activation: + output = self.activation(output) + + if self.has_last_dropout: + output = self.last_dropout(output, training=training) + + return output + + +def shape_list(x): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +def get_initializer(initializer_range=0.02): + """Creates a `tf.initializers.truncated_normal` with the given range. + Args: + initializer_range: float, initializer range for stddev. + Returns: + TruncatedNormal initializer with stddev = `initializer_range`. + """ + return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) + + +TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + "bert-base-japanese": 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", + "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/tf_model.h5", +} + + +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + + +def gelu_new(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def swish(x): + return x * tf.sigmoid(x) + + +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), + "gelu_new": tf.keras.layers.Activation(gelu_new), +} + + +class TFBertEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. 
+ self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. 
+ """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFBertSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + assert config.hidden_size % config.num_attention_heads == 0 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.amp = config.amp + + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) + attention_scores = attention_scores / tf.cast(tf.math.sqrt(dk), tf.float16 if self.amp else tf.float32) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class TFBertSelfOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.self_attention = TFBertSelfAttention(config, name="self") + self.dense_output = TFBertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + input_tensor, attention_mask, head_mask = inputs + + self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training) + attention_output = self.dense_output([self_outputs[0], input_tensor], training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertIntermediate(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class TFBertOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.attention = TFBertAttention(config, name="attention") + self.intermediate = TFBertIntermediate(config, name="intermediate") + self.bert_output = TFBertOutput(config, name="output") + + def 
call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output([intermediate_output, attention_output], training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # outputs, (hidden states), (attentions) + + +class TFBertPooler(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + + def call(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + return pooled_output + + +class TFBertPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class TFBertLMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.transform = TFBertPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
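+        # `input_embeddings` is the TFBertEmbeddings layer; call() below invokes it
+        # with mode="linear", which multiplies hidden states by the transposed word
+        # embedding matrix to produce vocabulary logits, then adds the bias.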
+ self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +class TFBertMLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class TFBertNSPHead(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.seq_relationship = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + ) + + def call(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +@keras_serializable +class TFBertMainLayer(tf.keras.layers.Layer): + config_class = BertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.num_hidden_layers = config.num_hidden_layers + + self.embeddings = TFBertEmbeddings(config, name="embeddings") + self.encoder = TFBertEncoder(config, name="encoder") + self.pooler = TFBertPooler(config, name="pooler") + + def get_input_embeddings(self): + return self.embeddings + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +class TFBertPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = BertConfig + pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "bert" + + +BERT_START_DOCSTRING = r""" + This model is a `tf.keras.Model `__ sub-class. + Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having + all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors + in the first positional argument : + + - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + training (:obj:`boolean`, `optional`, defaults to :obj:`False`): + Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them + (if set to :obj:`False`) for evaluation. 
+""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class TFBertModel(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.bert = TFBertMainLayer(config, name="bert") + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertModel + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertModel.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + """ + outputs = self.bert(inputs, **kwargs) + return outputs + + +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", + BERT_START_DOCSTRING, +) +class TFBertForPreTraining(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForPreTraining + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + seq_relationship_score = self.nsp(pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here + + return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) +class TFBertForMaskedLM(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMaskedLM + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + return outputs # prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) +class TFBertForNextSentencePrediction(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForNextSentencePrediction + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + seq_relationship_scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + seq_relationship_score = self.nsp(pooled_output) + + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, +) +class TFBertForSequenceClassification(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForSequenceClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) +class TFBertForMultipleChoice(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMultipleChoice + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices + outputs = model(input_ids) + classification_scores = outputs[0] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] + + outputs = self.bert(flat_inputs, training=training) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", + BERT_START_DOCSTRING, +) +class TFBertForTokenClassification(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForTokenClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, +) +class TFBertForQuestionAnswering(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). 
+ end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForQuestionAnswering + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/modelzoo/ELECTRA/optimization.py b/modelzoo/ELECTRA/optimization.py new file mode 100644 index 00000000..b83e487c --- /dev/null +++ b/modelzoo/ELECTRA/optimization.py @@ -0,0 +1,383 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
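# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): reducing the start/end
# scores from the TFBertForQuestionAnswering docstring example above to an
# answer string. It reuses `tokenizer`, `input_ids`, `start_scores` and
# `end_scores` from that example; the greedy argmax is for illustration only
# (run_inference.py later in this patch performs a proper n-best search).
# ---------------------------------------------------------------------------
import tensorflow as tf

start_index = int(tf.argmax(start_scores, axis=-1)[0])
end_index = int(tf.argmax(end_scores, axis=-1)[0])
tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0].tolist())
answer = " ".join(tokens[start_index:end_index + 1])
print(answer)
# ---------------------------------------------------------------------------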
+# ==============================================================================
+"""Functions and classes related to optimization (weight updates)."""
+
+import re
+import collections
+import tensorflow as tf
+import tensorflow_addons.optimizers as tfa_optimizers
+
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+from utils import log
+
+
+class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
+    """Applies a warmup schedule on a given learning rate decay schedule."""
+
+    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
+        super().__init__()
+        self.initial_learning_rate = initial_learning_rate
+        self.warmup_steps = warmup_steps
+        self.power = power
+        self.decay_schedule_fn = decay_schedule_fn
+        self.name = name
+
+    def __call__(self, step):
+        with tf.name_scope(self.name or "WarmUp") as name:
+            # Implements polynomial warmup, i.e. if global_step < warmup_steps, the
+            # learning rate will be `global_step/num_warmup_steps * init_lr`.
+            global_step_float = tf.cast(step, tf.float32)
+            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
+            warmup_percent_done = global_step_float / warmup_steps_float
+            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
+            return tf.cond(
+                global_step_float < warmup_steps_float,
+                lambda: warmup_learning_rate,
+                lambda: self.decay_schedule_fn(step - self.warmup_steps),
+                name=name,
+            )
+
+    def get_config(self):
+        return {
+            "initial_learning_rate": self.initial_learning_rate,
+            "decay_schedule_fn": self.decay_schedule_fn,
+            "warmup_steps": self.warmup_steps,
+            "power": self.power,
+            "name": self.name,
+        }
+
+
+def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01,
+                     layerwise_lr_decay=-1, n_transformer_layers=None, clip_norm=1.0,
+                     optimizer="adam", skip_adaptive=False, power=1.0, beta_1=0.9, beta_2=0.999, end_lr=0.0):
+    """Creates an optimizer with learning rate schedule."""
+    # Implements linear decay of the learning rate.
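+    # Net effect of the schedule constructed below: when num_warmup_steps > 0, the
+    # learning rate ramps linearly from 0 to init_lr over the first num_warmup_steps
+    # steps (WarmUp with its default power=1.0); afterwards it follows a polynomial
+    # decay with the given `power` from init_lr down to end_lr over the remaining
+    # (num_train_steps - num_warmup_steps) steps.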
+    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+        initial_learning_rate=init_lr, decay_steps=num_train_steps - num_warmup_steps, end_learning_rate=end_lr, power=power
+    )
+    if num_warmup_steps:
+        learning_rate_fn = WarmUp(
+            initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
+        )
+    layer_decay = None
+    if layerwise_lr_decay > 0 and n_transformer_layers is not None:
+        layer_decay = _get_layer_decay(layerwise_lr_decay, n_transformer_layers)
+
+    if optimizer == "adam":
+        optimizer = AdamWeightDecay(
+            learning_rate=learning_rate_fn,
+            weight_decay_rate=weight_decay_rate,
+            layer_decay=layer_decay,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
+            clip_norm=clip_norm,
+        )
+    else:
+        if skip_adaptive:
+            skip_list = ["layer_norm", "bias", "LayerNorm"]
+        else:
+            skip_list = ["None"]
+        log("Skip list for LAMB {}".format(skip_list))
+
+        optimizer = tfa_optimizers.LAMB(
+            learning_rate=learning_rate_fn,
+            weight_decay_rate=weight_decay_rate,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
+            exclude_from_layer_adaptation=skip_list,
+        )
+
+    return optimizer
+
+
+class AdamWeightDecay(tf.keras.optimizers.Adam):
+    """Adam enables L2 weight decay and clip_by_global_norm on gradients.
+
+    Just adding the square of the weights to the loss function is *not* the
+    correct way of using L2 regularization/weight decay with Adam, since that will
+    interact with the m and v parameters in strange ways.
+
+    Instead we want to decay the weights in a manner that doesn't interact with
+    the m/v parameters. This is equivalent to adding the square of the weights to
+    the loss with plain (non-momentum) SGD.
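+
+    Concretely, `_decay_weights_op` below subtracts
+    `learning_rate * weight_decay_rate * var` from each eligible variable before
+    the Adam update is applied, and `_do_use_weight_decay` exempts any variable
+    whose name matches an entry in `exclude_from_weight_decay`.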
+ """ + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay_rate=0.0, + include_in_weight_decay=None, + exclude_from_weight_decay=None, + layer_decay=None, + clip_norm=1.0, + name="AdamWeightDecay", + **kwargs + ): + super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay + self.layer_decay = layer_decay + self.clip_norm = clip_norm + + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {"WarmUp": WarmUp} + return super().from_config(config, custom_objects=custom_objects) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate") + + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking + ) + return tf.no_op() + + def apply_gradients(self, grads_and_vars, name=None, experimental_aggregate_gradients=True): + grads, tvars = list(zip(*grads_and_vars)) + # Being done in train_step + ##(grads, _) = tf.clip_by_global_norm(grads, clip_norm=self.clip_norm) + return super().apply_gradients(zip(grads, tvars), name=name, + experimental_aggregate_gradients=experimental_aggregate_gradients) + + def _get_lr(self, var, apply_state): + """Retrieves the learning rate with the given state.""" + # if apply_state is None: + # return self._decayed_lr_t[var_dtype], {} + var_name, var_device, var_dtype = var.name, var.device, var.dtype.base_dtype + + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients + lr_t = coefficients["lr_t"] + lr = coefficients["lr"] + + if self.layer_decay is not None: + update_for_var = False + for key in self.layer_decay: + if key in var_name: + update_for_var = True + lr_t *= self.layer_decay[key] + lr *= self.layer_decay[key] + break + if not update_for_var: + raise ValueError("No learning rate specified for variable", var) + + return lr_t, lr, coefficients, dict(apply_state=apply_state) + + def _resource_apply_dense(self, grad, var, apply_state=None): + # print("Dense: {} {} {}".format(var.name, var.device, var.dtype.base_dtype)) + lr_t, _, coefficients, kwargs = self._get_lr(var, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + + if not self.amsgrad: + return training_ops.resource_apply_adam( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + lr_t, + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + use_locking=self._use_locking) + else: + vhat = self.get_slot(var, 'vhat') + return training_ops.resource_apply_adam_with_amsgrad( + var.handle, + m.handle, + v.handle, + vhat.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + lr_t, + coefficients['beta_1_t'], + 
coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + use_locking=self._use_locking) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + # print("Sparse: {} {} {}".format(var.name, var.device, var.dtype.base_dtype)) + lr_t, lr, coefficients, kwargs = self._get_lr(var, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, 'm') + m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] + m_t = state_ops.assign(m, m * coefficients['beta_1_t'], + use_locking=self._use_locking) + with tf.control_dependencies([m_t]): + m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, 'v') + v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] + v_t = state_ops.assign(v, v * coefficients['beta_2_t'], + use_locking=self._use_locking) + with tf.control_dependencies([v_t]): + v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) + + if not self.amsgrad: + v_sqrt = math_ops.sqrt(v_t) + var_update = state_ops.assign_sub( + var, lr * m_t / (v_sqrt + coefficients['epsilon']), + use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t]) + else: + v_hat = self.get_slot(var, 'vhat') + v_hat_t = math_ops.maximum(v_hat, v_t) + with tf.control_dependencies([v_hat_t]): + v_hat_t = state_ops.assign( + v_hat, v_hat_t, use_locking=self._use_locking) + v_hat_sqrt = math_ops.sqrt(v_hat_t) + var_update = state_ops.assign_sub( + var, + lr * m_t / (v_hat_sqrt + coefficients['epsilon']), + use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t]) + + def get_config(self): + config = super().get_config() + config.update({"weight_decay_rate": self.weight_decay_rate}) + return config + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False + + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + +# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py +class GradientAccumulator(object): + """Distribution strategies-aware gradient accumulation utility.""" + + def __init__(self): + """Initializes the accumulator.""" + self._gradients = [] + self._accum_steps = tf.Variable( + initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA + ) + + @property + def step(self): + """Number of accumulated steps.""" + return self._accum_steps.value() + + @property + def gradients(self): + """The accumulated gradients.""" + return list( + gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients() + ) + + def __call__(self, gradients): + """Accumulates :obj:`gradients`.""" + if not self._gradients: + self._gradients.extend( + [ + tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient + for gradient in gradients + ] + ) + + if len(gradients) != len(self._gradients): + raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) + + for accum_gradient, gradient in 
zip(self._get_replica_gradients(), gradients): + if accum_gradient is not None and gradient is not None: + accum_gradient.assign_add(gradient) + + self._accum_steps.assign_add(1) + + def reset(self): + """Resets the accumulated gradients.""" + if self._gradients: + self._accum_steps.assign(0) + + for gradient in self._get_replica_gradients(): + if gradient is not None: + gradient.assign(tf.zeros_like(gradient)) + + def _get_replica_gradients(self): + if tf.distribute.has_strategy(): + # In a replica context, we want to accumulate gradients on each replica + # without synchronization, so we directly assign the value of the + # current replica. + replica_context = tf.distribute.get_replica_context() + + if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1: + return self._gradients + + return ( + gradient.device_map.select_for_current_replica(gradient.values, replica_context) + for gradient in self._gradients + if gradient is not None + ) + else: + return self._gradients + + +def _get_layer_decay(layer_decay, n_layers): + """Have lower learning rates for layers closer to the input.""" + key_to_depths = collections.OrderedDict({ + "/embeddings/": 0, + "/embeddings_project/": 0, + "/start_logits/": n_layers + 2, + "/end_logits/": n_layers + 2, + "/answer_class/": n_layers + 2, + "/qa_outputs/": n_layers + 2, + }) + for layer in range(n_layers): + key_to_depths["encoder/layer_._" + str(layer) + "/"] = layer + 1 + return { + key: layer_decay ** (n_layers + 2 - depth) + for key, depth in key_to_depths.items() + } diff --git a/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py b/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py new file mode 100644 index 00000000..a18c3643 --- /dev/null +++ b/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
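# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): the multipliers that
# `_get_layer_decay` in optimization.py above produces. With layer_decay=0.8
# and n_layers=12, parameters closer to the input get exponentially smaller
# learning-rate multipliers, while the task heads keep the full rate.
# ---------------------------------------------------------------------------
from optimization import _get_layer_decay

mult = _get_layer_decay(0.8, 12)
print(mult["/embeddings/"])         # 0.8 ** 14 -- depth 0, smallest multiplier
print(mult["encoder/layer_._0/"])   # 0.8 ** 13 -- first transformer layer
print(mult["encoder/layer_._11/"])  # 0.8 ** 2  -- last transformer layer
print(mult["/qa_outputs/"])         # 0.8 ** 0 == 1.0 -- heads keep the base LR
# ---------------------------------------------------------------------------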
+ +import argparse +import collections +import json +import os + +import tensorflow as tf + +from utils import log, heading +from run_pretraining import PretrainingConfig +from modeling import PretrainingModel + + +def from_pretrained_ckpt(args): + config = PretrainingConfig( + model_name='postprocessing', + data_dir='postprocessing', + generator_hidden_size=0.3333333, + ) + + # Padding for divisibility by 8 + if config.vocab_size % 8 != 0: + config.vocab_size += 8 - (config.vocab_size % 8) + + if args.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + # Set up model + model = PretrainingModel(config) + + # Load checkpoint + checkpoint = tf.train.Checkpoint(step=tf.Variable(1), model=model) + checkpoint.restore(args.pretrained_checkpoint).expect_partial() + log(" ** Restored from {} at step {}".format(args.pretrained_checkpoint, int(checkpoint.step) - 1)) + + disc_dir = os.path.join(args.output_dir, 'discriminator') + gen_dir = os.path.join(args.output_dir, 'generator') + + heading(" ** Saving discriminator") + model.discriminator(model.discriminator.dummy_inputs) + model.discriminator.save_pretrained(disc_dir) + + heading(" ** Saving generator") + model.generator(model.generator.dummy_inputs) + model.generator.save_pretrained(gen_dir) + + +if __name__ == '__main__': + # Parse essential args + parser = argparse.ArgumentParser() + parser.add_argument('--pretrained_checkpoint') + parser.add_argument('--output_dir') + parser.add_argument('--amp', action='store_true', default=False) + args = parser.parse_args() + + from_pretrained_ckpt(args) diff --git a/modelzoo/ELECTRA/pretrain_utils.py b/modelzoo/ELECTRA/pretrain_utils.py new file mode 100644 index 00000000..029dce9f --- /dev/null +++ b/modelzoo/ELECTRA/pretrain_utils.py @@ -0,0 +1,367 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
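# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): driving
# postprocess_pretrained_ckpt.py above programmatically instead of via the CLI.
# The checkpoint and output paths are hypothetical placeholders.
# ---------------------------------------------------------------------------
from types import SimpleNamespace

from postprocess_pretrained_ckpt import from_pretrained_ckpt

args = SimpleNamespace(
    pretrained_checkpoint="results/models/base/checkpoints/ckpt-57450",  # hypothetical path
    output_dir="results/models/base/postprocessed",                      # hypothetical path
    amp=False,
)
from_pretrained_ckpt(args)
# ---------------------------------------------------------------------------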
+ +"""Helpers for preparing pre-training data and supplying them to the model.""" + +import collections + +import numpy as np +import tensorflow as tf + +import utils +import tokenization + + +def get_dataset(config, batch_size, num_cpu_threads=4, world_size=1, rank=0): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + } + + input_files = [] + for input_pattern in config.pretrain_tfrecords.split(","): + input_files.extend(tf.io.gfile.glob(input_pattern)) + + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.shard(num_shards=world_size, index=rank) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files), seed=config.seed, reshuffle_each_iteration=False) + + cycle_length = min(num_cpu_threads, len(input_files)) + d = d.interleave( + tf.data.TFRecordDataset, + cycle_length=cycle_length, + deterministic=True) + d = d.shuffle(buffer_size=100, seed=config.seed, reshuffle_each_iteration=False) + + d = d.map(lambda record: _decode_record(record, name_to_features)) + d = d.batch(batch_size) + + return d + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + +# model inputs - it's a bit nicer to use a namedtuple rather than keep the +# features as a dict +Inputs = collections.namedtuple( + "Inputs", ["input_ids", "input_mask", "segment_ids", "masked_lm_positions", + "masked_lm_ids", "masked_lm_weights"]) + + +def features_to_inputs(features): + return Inputs( + input_ids=features["input_ids"], + input_mask=features["input_mask"], + segment_ids=features["segment_ids"], + masked_lm_positions=(features["masked_lm_positions"] + if "masked_lm_positions" in features else None), + masked_lm_ids=(features["masked_lm_ids"] + if "masked_lm_ids" in features else None), + masked_lm_weights=(features["masked_lm_weights"] + if "masked_lm_weights" in features else None), + ) + + +def get_updated_inputs(inputs, **kwargs): + features = inputs._asdict() + for k, v in kwargs.items(): + features[k] = v + return features_to_inputs(features) + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. 
+  """
+  if isinstance(tensor, np.ndarray) or isinstance(tensor, list):
+    shape = np.array(tensor).shape
+    if isinstance(expected_rank, int):  # plain `int`: `six` is not imported in this module
+      assert len(shape) == expected_rank
+    elif expected_rank is not None:
+      assert len(shape) in expected_rank
+    return shape
+  #
+  # if name is None:
+  #   name = tensor.name
+  #
+  # if expected_rank is not None:
+  #   assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def gather_positions(sequence, positions):
+  """Gathers the vectors at the specific positions over a minibatch.
+
+  Args:
+    sequence: A [batch_size, seq_length] or
+        [batch_size, seq_length, depth] tensor of values
+    positions: A [batch_size, n_positions] tensor of indices
+
+  Returns: A [batch_size, n_positions] or
+    [batch_size, n_positions, depth] tensor of the values at the indices
+  """
+  shape = get_shape_list(sequence, expected_rank=[2, 3])
+  depth_dimension = (len(shape) == 3)
+  if depth_dimension:
+    B, L, D = shape
+  else:
+    B, L = shape
+    D = 1
+    sequence = tf.expand_dims(sequence, -1)
+  position_shift = tf.expand_dims(L * tf.range(B), -1)
+  flat_positions = tf.reshape(positions + position_shift, [-1])
+  flat_sequence = tf.reshape(sequence, [B * L, D])
+  gathered = tf.gather(flat_sequence, flat_positions)
+  if depth_dimension:
+    return tf.reshape(gathered, [B, -1, D])
+  else:
+    return tf.reshape(gathered, [B, -1])
+
+
+def scatter_update(sequence, updates, positions):
+  """Scatter-update a sequence.
+
+  Args:
+    sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
+    updates: A tensor of size batch_size*seq_len(*depth)
+    positions: A [batch_size, n_positions] tensor
+
+  Returns: A tuple of two tensors. First is a [batch_size, seq_len] or
+    [batch_size, seq_len, depth] tensor of "sequence" with elements at
+    "positions" replaced by the values at "updates." Updates to index 0 are
+    ignored. If there are duplicated positions the update is only applied once.
+    Second is a [batch_size, seq_len] mask tensor of which inputs were updated.
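+
+  For example, scatter_update(tf.constant([[10, 11, 12, 13]]),
+  tf.constant([[99]]), tf.constant([[2]])) returns
+  ([[10, 11, 99, 13]], [[0, 0, 1, 0]]).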
+ """ + shape = get_shape_list(sequence, expected_rank=[2, 3]) + depth_dimension = (len(shape) == 3) + if depth_dimension: + B, L, D = shape + else: + B, L = shape + D = 1 + sequence = tf.expand_dims(sequence, -1) + N = get_shape_list(positions)[1] + + shift = tf.expand_dims(L * tf.range(B), -1) + flat_positions = tf.reshape(positions + shift, [-1, 1]) + flat_updates = tf.reshape(updates, [-1, D]) + updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D]) + updates = tf.reshape(updates, [B, L, D]) + + flat_updates_mask = tf.ones([B * N], tf.int32) + updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L]) + updates_mask = tf.reshape(updates_mask, [B, L]) + not_first_token = tf.concat([tf.zeros((B, 1), tf.int32), + tf.ones((B, L - 1), tf.int32)], -1) + updates_mask *= not_first_token + updates_mask_3d = tf.expand_dims(updates_mask, -1) + + # account for duplicate positions + if sequence.dtype == tf.float32: + updates_mask_3d = tf.cast(updates_mask_3d, tf.float32) + updates /= tf.maximum(1.0, updates_mask_3d) + else: + assert sequence.dtype == tf.int32 + updates = tf.math.floordiv(updates, tf.maximum(1, updates_mask_3d)) + updates_mask = tf.minimum(updates_mask, 1) + updates_mask_3d = tf.minimum(updates_mask_3d, 1) + + updated_sequence = (((1 - updates_mask_3d) * sequence) + + (updates_mask_3d * updates)) + if not depth_dimension: + updated_sequence = tf.squeeze(updated_sequence, -1) + + return updated_sequence, updates_mask + + +def _get_candidates_mask(inputs: Inputs, vocab, + disallow_from_mask=None): + """Returns a mask tensor of positions in the input that can be masked out.""" + ignore_ids = [vocab["[SEP]"], vocab["[CLS]"], vocab["[MASK]"]] + candidates_mask = tf.ones_like(inputs.input_ids, tf.bool) + for ignore_id in ignore_ids: + candidates_mask &= tf.not_equal(inputs.input_ids, ignore_id) + candidates_mask &= tf.cast(inputs.input_mask, tf.bool) + if disallow_from_mask is not None: + candidates_mask &= ~disallow_from_mask + return candidates_mask + + +def mask(config, inputs, mask_prob, proposal_distribution=1.0, + disallow_from_mask=None, already_masked=None): + """Implementation of dynamic masking. The optional arguments aren't needed for + BERT/ELECTRA and are from early experiments in "strategically" masking out + tokens instead of uniformly at random. + + Args: + config: configure_pretraining.PretrainingConfig + inputs: pretrain_data.Inputs containing input input_ids/input_mask + mask_prob: percent of tokens to mask + proposal_distribution: for non-uniform masking can be a [B, L] tensor + of scores for masking each position. 
+ disallow_from_mask: a boolean tensor of [B, L] of positions that should + not be masked out + already_masked: a boolean tensor of [B, N] of already masked-out tokens + for multiple rounds of masking + Returns: a pretrain_data.Inputs with masking added + """ + # Get the batch size, sequence length, and max masked-out tokens + N = config.max_predictions_per_seq + B, L = get_shape_list(inputs.input_ids) + + # Find indices where masking out a token is allowed + vocab = tokenization.ElectraTokenizer( + config.vocab_file, do_lower_case=config.do_lower_case).get_vocab() + candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask) + + # Set the number of tokens to mask out per example + num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32) + num_to_predict = tf.maximum(1, tf.minimum( + N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32))) + masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32) + if already_masked is not None: + masked_lm_weights *= (1 - already_masked) + + # Get a probability of masking each position in the sequence + candidate_mask_float = tf.cast(candidates_mask, tf.float32) + sample_prob = (proposal_distribution * candidate_mask_float) + sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True) + + # Sample the positions to mask out + sample_prob = tf.stop_gradient(sample_prob) + sample_logits = tf.math.log(sample_prob) + masked_lm_positions = tf.random.categorical( + sample_logits, N, dtype=tf.int32) + masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32) + + # Get the ids of the masked-out tokens + shift = tf.expand_dims(L * tf.range(B), -1) + flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1]) + masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]), + flat_positions) + masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1]) + masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32) + + # Update the input ids + replace_with_mask_positions = masked_lm_positions * tf.cast( + tf.less(tf.random.uniform([B, N]), 0.85), tf.int32) + inputs_ids, _ = scatter_update( + inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]), + replace_with_mask_positions) + + return get_updated_inputs( + inputs, + input_ids=tf.stop_gradient(inputs_ids), + masked_lm_positions=masked_lm_positions, + masked_lm_ids=masked_lm_ids, + masked_lm_weights=masked_lm_weights + ) + + +def unmask(inputs: Inputs): + unmasked_input_ids, _ = scatter_update( + inputs.input_ids, inputs.masked_lm_ids, inputs.masked_lm_positions) + return get_updated_inputs(inputs, input_ids=unmasked_input_ids) + + +def sample_from_softmax(logits, disallow=None): + if disallow is not None: + logits -= 1000.0 * disallow + uniform_noise = tf.random.uniform( + get_shape_list(logits), minval=0, maxval=1) + gumbel_noise = tf.cast(-tf.math.log(-tf.math.log(uniform_noise + 1e-9) + 1e-9), logits.dtype) + return tf.one_hot(tf.argmax(tf.nn.softmax(logits + gumbel_noise), -1, + output_type=tf.int32), logits.shape[-1]) + + +ENDC = "\033[0m" +COLORS = ["\033[" + str(n) + "m" for n in list(range(91, 97)) + [90]] +RED = COLORS[0] +BLUE = COLORS[3] +CYAN = COLORS[5] +GREEN = COLORS[1] + + +def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None): + """Pretty-print model inputs.""" + pos_to_tokid = {} + for tokid, pos, weight in zip( + inputs.masked_lm_ids[0], inputs.masked_lm_positions[0], + inputs.masked_lm_weights[0]): + if weight == 0: + pass + else: + pos_to_tokid[pos] = tokid + + text = "" + provided_update_mask = (updates_mask is not None) + if not 
provided_update_mask: + updates_mask = np.zeros_like(inputs.input_ids) + for pos, (tokid, um) in enumerate( + zip(inputs.input_ids[0], updates_mask[0])): + token = inv_vocab[tokid] + if token == "[PAD]": + break + if pos in pos_to_tokid: + token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC + if provided_update_mask: + assert um == 1 + else: + if provided_update_mask: + assert um == 0 + text += token + " " + utils.log(utils.printable_text(text)) diff --git a/modelzoo/ELECTRA/run.sub b/modelzoo/ELECTRA/run.sub new file mode 100644 index 00000000..50696b25 --- /dev/null +++ b/modelzoo/ELECTRA/run.sub @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --exclusive +#SBATCH --mem=0 +#SBATCH --overcommit + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux +# Docker image resulting from bash scripts/docker/build.sh +readonly docker_image="gitlab-master.nvidia.com/dl/joc/electra_tf2:keras_mp_20.07_clean_up" +# Location of dataset for phase 1 amd phase 2 +readonly datadir="/lustre/fsw/joc-luna/sharatht/electra_tf2_data/" + +readonly mounts=".:/workspace/electra,${datadir}:/workspace/electra/data" + +DGXSYSTEM=DGXA100 +cluster="selene" +if [[ "${DGXSYSTEM}" == DGX2* ]]; then + cluster='circe' +fi +if [[ "${DGXSYSTEM}" == DGXA100* ]]; then + cluster='selene' +fi + +BIND_CMD="./scripts/bind.sh --cpu=exclusive --ib=single --cluster=$cluster -- " + +BATCHSIZE=${BATCHSIZE:-16} +PHASE=${PHASE:-1} +LR=${LR:-3e-3} +STEPS=${STEPS:-57450} +WARMUP=${WARMUP:-3750} +GRAD_ACCUM_STEPS=${GRAD_ACCUM_STEPS:-1} +b1=${b1:-"0.878"} +b2=${b2:-"0.974"} +decay=${decay:-"0.5"} +end_lr=${end_lr:-"0.0"} +skip_adaptive=${skip_adaptive:-"yes"} +model_count=${model_count:-1} + +skip_flag="" +if [ "$skip_adaptive" = "yes" ] ; then + skip_flag=" --skip_adaptive" +fi + +ckpt_STEPS=$(awk -v a=$STEPS 'BEGIN { print a / 10}') + +if [ "$PHASE" = "1" ] ; then + +LAUNCH_CMD="$BIND_CMD python run_pretraining.py \ + --model_name='electra_keras_mp_base_lamb_48x8x${BATCHSIZE}x${GRAD_ACCUM_STEPS}_p1_skip_adaptive_${skip_adaptive}_LR_${LR}_WARMUP_${WARMUP}_STEPS_${STEPS}_b1_${b1}_b2_${b2}_decay_${decay}_end_lr_${end_lr}_${model_count}' \ + --pretrain_tfrecords='/workspace/electra/data/tfrecord_lower_case_1_seq_len_128_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*' \ + --num_train_steps=$STEPS \ + --num_warmup_steps=$WARMUP \ + --disc_weight=50.0 \ + --generator_hidden_size=0.3333333 \ + --learning_rate=$LR \ + --train_batch_size=$BATCHSIZE \ + --max_seq_length=128 --log_freq=10 \ + --save_checkpoints_steps=$ckpt_STEPS \ + --optimizer='lamb' $skip_flag --opt_beta_1=$b1 --opt_beta_2=$b2 --lr_decay_power=$decay --end_lr=$end_lr $skip_flag --gradient_accumulation_steps=$GRAD_ACCUM_STEPS --amp --xla " +else +LAUNCH_CMD="$BIND_CMD python run_pretraining.py \ + --model_name='electra_keras_mp_base_lamb_48x8x176x1_p1_skip_adaptive_yes_LR_6e-3_WARMUP_2000_STEPS_10000_b1_0.878_b2_0.974_decay_0.5_end_lr_0.0_${model_count}' \ + 
--pretrain_tfrecords='/workspace/electra/data/tfrecord_lower_case_1_seq_len_512_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*' \ + --num_train_steps=$STEPS \ + --num_warmup_steps=$WARMUP \ + --disc_weight=50.0 \ + --generator_hidden_size=0.3333333 \ + --learning_rate=$LR \ + --train_batch_size=$BATCHSIZE \ + --max_seq_length=512 --log_freq=10 \ + --restore_checkpoint --phase2 \ + --save_checkpoints_steps=$ckpt_STEPS \ + --optimizer='lamb' $skip_flag --opt_beta_1=$b1 --opt_beta_2=$b2 --lr_decay_power=$decay --end_lr=$end_lr $skip_flag --gradient_accumulation_steps=$GRAD_ACCUM_STEPS --amp --xla " +fi; + +srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${LAUNCH_CMD}" diff --git a/modelzoo/ELECTRA/run_inference.py b/modelzoo/ELECTRA/run_inference.py new file mode 100644 index 00000000..436f5814 --- /dev/null +++ b/modelzoo/ELECTRA/run_inference.py @@ -0,0 +1,212 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import subprocess +import time +import argparse +import json +import logging +import collections + +import tensorflow as tf + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +from configuration import ElectraConfig +from modeling import TFElectraForQuestionAnswering +from tokenization import ElectraTokenizer +from squad_utils import SquadResult, RawResult, _get_best_indices + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/electra-small-generator", + "google/electra-base-generator", + "google/electra-large-generator", + "google/electra-small-discriminator", + "google/electra-base-discriminator", + "google/electra-large-discriminator", + # See all ELECTRA models at https://huggingface.co/models?filter=electra +] + +_PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument("--electra_model", default=None, type=str, required=True, + help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST)) + parser.add_argument("--init_checkpoint", + default=None, + type=str, + required=True, + help="The checkpoint file from pretraining") + parser.add_argument("--question", + default=None, + type=str, + required=True, + help="Question") + parser.add_argument("--context", + default=None, + type=str, + required=True, + help="Context") + parser.add_argument( + "--joint_head", + default=True, + type=bool, + help="Jointly predict the start and end positions", + ) + parser.add_argument( + "--beam_size", + default=4, + type=int, + help="Beam size when doing joint predictions", + ) + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The 
maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + + parser.add_argument('--version_2_with_negative', + action='store_true', + help='If true, the SQuAD examples contain some that do not have an answer.') + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + + args = parser.parse_args() + + return args + + +def get_predictions_joint_head(start_indices, end_indices, result, max_len, args): + predictions = [] + for i in range(args.beam_size): + start_index = start_indices[i] + for j in range(args.beam_size): + # for end_index in end_indices: + end_index = end_indices[i * args.beam_size + j] + if start_index >= max_len: + continue + if end_index >= max_len: + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[i], + end_logit=result.end_logits[i * args.beam_size + j])) + return predictions + + +def get_predictions(start_indices, end_indices, result, max_len, args): + predictions = [] + for start_index in start_indices: + for end_index in end_indices: + if start_index >= max_len: + continue + if end_index >= max_len: + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + return predictions + + +def main(): + args = parse_args() + print("***** Loading tokenizer and model *****") + electra_model = args.electra_model + config = ElectraConfig.from_pretrained(electra_model) + tokenizer = ElectraTokenizer.from_pretrained(electra_model) + model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, args=args) + + print("***** Loading fine-tuned checkpoint: {} *****".format(args.init_checkpoint)) + model.load_weights(args.init_checkpoint, by_name=False, skip_mismatch=False).expect_partial() + + question, text = args.question, args.context + encoding = tokenizer.encode_plus(question, text, return_tensors='tf') + input_ids, token_type_ids, attention_mask = encoding["input_ids"], encoding["token_type_ids"], \ + encoding["attention_mask"] + all_tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0]) + if not args.joint_head: + start_logits, end_logits = model(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + )[:2] + start_logits = start_logits[0].numpy().tolist() + end_logits = end_logits[0].numpy().tolist() + result = RawResult(unique_id=0, + start_logits=start_logits, + end_logits=end_logits) + + start_indices = _get_best_indices(result.start_logits, args.n_best_size) + end_indices = _get_best_indices(result.end_logits, args.n_best_size) + predictions = get_predictions(start_indices, end_indices, result, len(all_tokens), args) + null_score = result.start_logits[0] + result.end_logits[0] + + else: + outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + output = [output[0].numpy().tolist() for output in outputs] + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + 
cls_logits = output[4] + result = SquadResult( + 0, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + predictions = get_predictions_joint_head(result.start_top_index, result.end_top_index, result, len(all_tokens), args) + null_score = result.cls_logits + + predictions = sorted(predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + answer = predictions[0] + answer = ' '.join(all_tokens[answer.start_index: answer.end_index + 1]) + if args.null_score_diff_threshold > null_score and args.version_2_with_negative: + answer = '' + + print(answer) + + return answer + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/run_pretraining.py b/modelzoo/ELECTRA/run_pretraining.py new file mode 100644 index 00000000..eb58e9fd --- /dev/null +++ b/modelzoo/ELECTRA/run_pretraining.py @@ -0,0 +1,505 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pre-trains an ELECTRA model.""" + +import argparse +import collections +import json +import time +import datetime +import os + +import tensorflow as tf +import horovod.tensorflow as hvd +from horovod.tensorflow.compression import Compression +from gpu_affinity import set_affinity + +import utils +import sys +import pretrain_utils +from utils import get_rank, get_world_size, is_main_process, log, log_config, setup_logger, postprocess_dllog +from tokenization import ElectraTokenizer +from modeling import PretrainingModel +from optimization import create_optimizer, GradientAccumulator +import dllogger + +class PretrainingConfig(object): + """Defines pre-training hyperparameters.""" + + def __init__(self, model_name, **kwargs): + self.model_name = model_name + self.seed = 42 + + self.debug = False # debug mode for quickly running things + self.do_train = True # pre-train ELECTRA + self.do_eval = False # evaluate generator/discriminator on unlabeled data + self.phase2 = False + + # amp + self.amp = True + self.xla = True + self.fp16_compression = False + + # optimizer type + self.optimizer = 'adam' + self.gradient_accumulation_steps = 1 + + # lamb whitelisting for LN and biases + self.skip_adaptive = False + + # loss functions + self.electra_objective = True # if False, use the BERT objective instead + self.gen_weight = 1.0 # masked language modeling / generator loss + self.disc_weight = 50.0 # discriminator loss + self.mask_prob = 0.15 # percent of input tokens to mask out / replace + + # optimization + self.learning_rate = 5e-4 + self.lr_decay_power = 0.5 + self.weight_decay_rate = 0.01 + self.num_warmup_steps = 10000 + self.opt_beta_1 = 0.878 + self.opt_beta_2 = 0.974 + self.end_lr = 0.0 + + # training settings + self.log_freq = 10 + self.skip_checkpoint = False + self.save_checkpoints_steps = 1000 + self.num_train_steps = 1000000 + self.num_eval_steps = 100 + self.keep_checkpoint_max = 5 # maximum number of recent checkpoint 
files to keep; change to 0 or None to keep all checkpoints + self.restore_checkpoint = None + self.load_weights = False + self.steps_this_run = -1 + + # model settings + self.model_size = "base" # one of "small", "base", or "large" + # override the default transformer hparams for the provided model size; see + # modeling.BertConfig for the possible hparams and util.training_utils for + # the defaults + self.model_hparam_overrides = ( + kwargs["model_hparam_overrides"] + if "model_hparam_overrides" in kwargs else {}) + self.embedding_size = None # bert hidden size by default + self.vocab_size = 30522 # number of tokens in the vocabulary + self.do_lower_case = True # lowercase the input? + + # generator settings + self.uniform_generator = False # generator is uniform at random + self.shared_embeddings = True # share generator/discriminator token embeddings? + # self.untied_generator = True # tie all generator/discriminator weights? + self.generator_layers = 1.0 # frac of discriminator layers for generator + self.generator_hidden_size = 0.25 # frac of discrim hidden size for gen + self.disallow_correct = False # force the generator to sample incorrect + # tokens (so 15% of tokens are always + # fake) + self.temperature = 1.0 # temperature for sampling from generator + + # batch sizes + self.max_seq_length = 128 + self.train_batch_size = 128 + self.eval_batch_size = 128 + + self.results_dir = "results" + self.json_summary = None + self.update(kwargs) + # default locations of data files + + self.pretrain_tfrecords = os.path.join( + "data", "pretrain_tfrecords/pretrain_data.tfrecord*") + self.vocab_file = os.path.join("vocab", "vocab.txt") + self.model_dir = os.path.join(self.results_dir, "models", model_name) + self.checkpoints_dir = os.path.join(self.model_dir, "checkpoints") + self.weights_dir = os.path.join(self.model_dir, "weights") + self.results_txt = os.path.join(self.results_dir, "unsup_results.txt") + self.results_pkl = os.path.join(self.results_dir, "unsup_results.pkl") + self.log_dir = os.path.join(self.model_dir, "logs") + + self.max_predictions_per_seq = int((self.mask_prob + 0.005) * + self.max_seq_length) + + # defaults for different-sized model + if self.model_size == "base": + self.embedding_size = 768 + self.hidden_size = 768 + self.num_hidden_layers = 12 + if self.hidden_size % 64 != 0: + raise ValueError("Hidden size {} should be divisible by 64. Number of attention heads is hidden size {} / 64 ".format(self.hidden_size, self.hidden_size)) + self.num_attention_heads = int(self.hidden_size / 64.) + elif self.model_size == "large": + self.embedding_size = 1024 + self.hidden_size = 1024 + self.num_hidden_layers = 24 + if self.hidden_size % 64 != 0: + raise ValueError("Hidden size {} should be divisible by 64. Number of attention heads is hidden size {} / 64 ".format(self.hidden_size, self.hidden_size)) + self.num_attention_heads = int(self.hidden_size / 64.) 
+ else: + raise ValueError("--model_size : 'base' and 'large supported only.") + self.act_func = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + + self.update(kwargs) + + def update(self, kwargs): + for k, v in kwargs.items(): + if v is not None: + self.__dict__[k] = v + + +def metric_fn(config, metrics, eval_fn_inputs): + """Computes the loss and accuracy of the model.""" + d = eval_fn_inputs + metrics["masked_lm_accuracy"].update_state( + y_true=tf.reshape(d["masked_lm_ids"], [-1]), + y_pred=tf.reshape(d["masked_lm_preds"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + metrics["masked_lm_loss"].update_state( + values=tf.reshape(d["mlm_loss"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + if config.electra_objective: + metrics["sampled_masked_lm_accuracy"].update_state( + y_true=tf.reshape(d["masked_lm_ids"], [-1]), + y_pred=tf.reshape(d["sampled_tokids"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + if config.disc_weight > 0: + metrics["disc_loss"].update_state(d["disc_loss"]) + #metrics["disc_auc"].update_state( + # d["disc_labels"] * d["input_mask"], + # d["disc_probs"] * tf.cast(d["input_mask"], tf.float32)) + metrics["disc_accuracy"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["input_mask"]) + metrics["disc_precision"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["disc_preds"] * d["input_mask"]) + metrics["disc_recall"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["disc_labels"] * d["input_mask"]) + return metrics + +@tf.function +def train_one_step(config, model, optimizer, features, accumulator, first_step, take_step, clip_norm=1.0): + + #Forward and Backward pass + with tf.GradientTape() as tape: + total_loss, eval_fn_inputs = model(features, is_training=True) + unscaled_loss = tf.stop_gradient(total_loss) + if config.amp: + total_loss = optimizer.get_scaled_loss(total_loss) + + #Backpropogate gradients + #tape = hvd.DistributedGradientTape( + # tape, sparse_as_dense=True, + # compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none) + gradients = tape.gradient(total_loss, model.trainable_variables) + + #Get unscaled gradients if AMP + if config.amp: + gradients = optimizer.get_unscaled_gradients(gradients) + + #Accumulate gradients + accumulator(gradients) + #Need to call apply_gradients on very first step irrespective of gradient accumulation + #This is required for the optimizer to build it's states + if first_step or take_step: + #All reduce and Clip the accumulated gradients + allreduced_accumulated_gradients = [None if g is None else hvd.allreduce(g / tf.cast(config.gradient_accumulation_steps, g.dtype), + compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none) + for g in accumulator.gradients] + (clipped_accumulated_gradients, _) = tf.clip_by_global_norm(allreduced_accumulated_gradients, clip_norm=clip_norm) + #Weight update + optimizer.apply_gradients(zip(clipped_accumulated_gradients, model.trainable_variables)) + accumulator.reset() + + #brodcast model weights after first train step + if first_step: + hvd.broadcast_variables(model.variables, root_rank=0) + hvd.broadcast_variables(optimizer.variables(), root_rank=0) + + return unscaled_loss, eval_fn_inputs + +def main(e2e_start_time): + # Parse essential argumentss + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", 
required=True) + parser.add_argument("--model_size", default="base", type=str, help="base or large") + parser.add_argument("--pretrain_tfrecords", type=str) + parser.add_argument("--phase2", action='store_true') + parser.add_argument("--fp16_compression", action='store_true') + parser.add_argument("--amp", action='store_true', + help="Whether to use fp16.") + parser.add_argument("--xla", action='store_true', + help="Whether to use xla.") + parser.add_argument("--seed", default=42, type=int) + parser.add_argument("--num_train_steps", type=int) + parser.add_argument("--num_warmup_steps", type=int) + parser.add_argument("--learning_rate", type=float) + parser.add_argument("--train_batch_size", type=int) + parser.add_argument("--max_seq_length", type=int) + + parser.add_argument("--mask_prob", type=float) + parser.add_argument("--disc_weight", type=float) + parser.add_argument("--generator_hidden_size", type=float) + + parser.add_argument("--log_freq", type=int, default=10, help="Training metrics logging frequency") + parser.add_argument("--save_checkpoints_steps", type=int) + parser.add_argument("--steps_this_run", type=int, default=-1, help="run a fixed number of steps only") + parser.add_argument("--keep_checkpoint_max", type=int) + parser.add_argument("--restore_checkpoint", default=None, type=str) + parser.add_argument("--load_weights", action='store_true') + parser.add_argument("--weights_dir") + + parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb") + parser.add_argument("--skip_adaptive", action='store_true', help="Whether to apply adaptive LR on LayerNorm and biases") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of Gradient Accumulation steps") + parser.add_argument("--lr_decay_power", type=float, default=0.5, help="LR decay power") + parser.add_argument("--opt_beta_1", type=float, default=0.878, help="Optimizer beta1") + parser.add_argument("--opt_beta_2", type=float, default=0.974, help="Optimizer beta2") + parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR") + parser.add_argument("--log_dir", type=str, default=None, help="Path to store logs") + parser.add_argument("--results_dir", type=str, default=None, help="Path to store all model results") + parser.add_argument("--skip_checkpoint", action='store_true', default=False, help="Path to store logs") + parser.add_argument('--json-summary', type=str, default=None, + help='If provided, the json summary will be written to the specified file.') + args = parser.parse_args() + config = PretrainingConfig(**args.__dict__) + # Padding for divisibility by 8 + if config.vocab_size % 8 != 0: + config.vocab_size += 8 - (config.vocab_size % 8) + + # Set up tensorflow + hvd.init() + + args.log_dir = config.log_dir + # DLLogger + setup_logger(args) + dllogger.metadata('training_sequences_per_second', {'unit': 'sequences/s'}) + dllogger.metadata('final_loss', {'unit': None}) + dllogger.metadata('e2e_train_time', {'unit': 's'}) + + set_affinity(hvd.local_rank()) + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.optimizer.set_jit(config.xla) + #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp}) + + if config.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + 
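+    # Under this policy compute runs in float16 while variables stay float32; the "dynamic"
+    # loss scale pairs with the LossScaleOptimizer wrapper created for the optimizer below.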
tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + #tf.random.set_seed(config.seed) + + # Set up config cont' + if config.load_weights and config.restore_checkpoint: + raise ValueError("`load_weights` and `restore_checkpoint` should not be on at the same time.") + if config.phase2 and not config.restore_checkpoint: + raise ValueError("`phase2` cannot be used without `restore_checkpoint`.") + utils.heading("Config:") + log_config(config) + + # Save pretrain configs + pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json') + if is_main_process(): + utils.write_json(config.__dict__, pretrain_config_json) + log("Configuration saved in {}".format(pretrain_config_json)) + + # Set up model + model = PretrainingModel(config) + + # Set up metrics + metrics = dict() + metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf") + metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss") + metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="masked_lm_accuracy") + metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss") + if config.electra_objective: + metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="sampled_masked_lm_accuracy") + if config.disc_weight > 0: + metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss") + metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc") + metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(name="disc_accuracy") + metrics["disc_precision"] = tf.keras.metrics.Accuracy(name="disc_precision") + metrics["disc_recall"] = tf.keras.metrics.Accuracy(name="disc_recall") + + # Set up tensorboard + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + train_log_dir = os.path.join(config.log_dir, current_time, + 'train_' + str(get_rank()) + '_of_' + str(get_world_size())) + train_summary_writer = tf.summary.create_file_writer(train_log_dir) + + # Set up dataset + dataset = pretrain_utils.get_dataset( + config, config.train_batch_size, world_size=get_world_size(), rank=get_rank()) + train_iterator = iter(dataset) + + # Set up optimizer + optimizer = create_optimizer( + init_lr=config.learning_rate, + num_train_steps=config.num_train_steps, + num_warmup_steps=config.num_warmup_steps, + weight_decay_rate=config.weight_decay_rate, + optimizer=config.optimizer, + skip_adaptive=config.skip_adaptive, + power=config.lr_decay_power, + beta_1=config.opt_beta_1, + beta_2=config.opt_beta_2, + end_lr=config.end_lr) + + accumulator = GradientAccumulator() + if config.amp: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") + + # Set up model checkpoint + checkpoint = tf.train.Checkpoint( + step=tf.Variable(0), phase2=tf.Variable(False), optimizer=optimizer, model=model) + manager = tf.train.CheckpointManager(checkpoint, config.checkpoints_dir, max_to_keep=config.keep_checkpoint_max) + if config.restore_checkpoint and config.restore_checkpoint != "latest": + checkpoint.restore(config.restore_checkpoint) + log(" ** Restored model checkpoint from {}".format(config.restore_checkpoint)) + elif config.restore_checkpoint and config.restore_checkpoint == "latest" and manager.latest_checkpoint: + checkpoint.restore(manager.latest_checkpoint) + log(" ** Restored model checkpoint from {}".format(manager.latest_checkpoint)) + elif config.load_weights: + 
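+    # Run both sub-models once on their dummy inputs so their Keras variables are built
+    # before load_weights() maps the pretrained H5 weights onto them.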
model.generator(model.generator.dummy_inputs) + model.discriminator(model.discriminator.dummy_inputs) + model.generator.load_weights(os.path.join(config.weights_dir, 'generator', 'tf_model.h5')) + model.discriminator.load_weights(os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5')) + else: + log(" ** Initializing from scratch.") + + restore_iterator = bool(config.restore_checkpoint) and config.restore_checkpoint == "latest" + # Initialize global step for phase2 + if config.phase2 and not bool(checkpoint.phase2): + optimizer.iterations.assign(0) + checkpoint.step.assign(0) + checkpoint.phase2.assign(True) + restore_iterator = False + if bool(checkpoint.phase2): + manager = tf.train.CheckpointManager( + checkpoint, config.checkpoints_dir, + checkpoint_name='ckpt-p2', + max_to_keep=config.keep_checkpoint_max) + + # Set up iterator checkpoint + iter_checkpoint = tf.train.Checkpoint( + train_iterator=train_iterator, world_size=tf.Variable(get_world_size()), rank=tf.Variable(get_rank())) + iter_manager = tf.train.CheckpointManager( + iter_checkpoint, + os.path.join(config.checkpoints_dir, 'iter_ckpt_rank_' + '{:02}'.format(get_rank())), + checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()), + max_to_keep=config.keep_checkpoint_max) + if restore_iterator and iter_manager.latest_checkpoint: + ckpt_world_size = tf.train.load_variable( + iter_manager.latest_checkpoint, 'world_size/.ATTRIBUTES/VARIABLE_VALUE') + if ckpt_world_size == get_world_size(): + iter_checkpoint.restore(iter_manager.latest_checkpoint) + log(" ** Restored iterator checkpoint from {}".format(iter_manager.latest_checkpoint), all_rank=True) + + utils.heading("Running training") + accumulator.reset() + train_start, start_step = time.time(), int(checkpoint.step) - 1 + local_step = 0 + saved_ckpt = False + while int(checkpoint.step) <= config.num_train_steps: + saved_ckpt = False + step = int(checkpoint.step) + features = next(train_iterator) + iter_start = time.time() + + # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir) + total_loss, eval_fn_inputs = train_one_step(config, model, optimizer, features, accumulator, + local_step==1, take_step=local_step % args.gradient_accumulation_steps == 0) + # if step == 300: tf.profiler.experimental.stop() + + metrics["train_perf"].update_state( + config.train_batch_size * get_world_size() / (time.time() - iter_start)) + metrics["total_loss"].update_state(values=total_loss) + metric_fn(config, metrics, eval_fn_inputs) + + if (step % args.log_freq == 0) and (local_step % args.gradient_accumulation_steps == 0): + log_info_dict = {k:float(v.result().numpy() * 100) if "accuracy" in k else float(v.result().numpy()) for k, v in metrics.items()} + dllogger.log(step=(step,), data=log_info_dict, verbosity=0) + log('Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' + 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, '.format( + step=step, **log_info_dict, + loss_scale=optimizer.loss_scale if config.amp else 1, + elapsed=utils.get_readable_time(time.time() - train_start), + eta=utils.get_readable_time( + (time.time() - train_start) / (step - start_step) * (config.num_train_steps - step))), + all_rank=True) + + with train_summary_writer.as_default(): + for key, m in metrics.items(): + tf.summary.scalar(key, m.result(), step=step) + + if int(checkpoint.step) < config.num_train_steps: + for m in metrics.values(): 
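+          # Reset running metrics after each logging window; they are intentionally kept on the
+          # final step so the allreduced end-of-training summary below still reflects them.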
+ m.reset_states() + + #Print allreduced metrics on the last step + if (int(checkpoint.step) == config.num_train_steps and (local_step % args.gradient_accumulation_steps == 0)) or ((local_step + 1) % (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0): + log_info_dict = {k:float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy" in k else float(hvd.allreduce(v.result()).numpy()) for k, v in metrics.items()} + log_info_dict["training_sequences_per_second"] = log_info_dict["train_perf"] + log_info_dict["final_loss"] = log_info_dict["total_loss"] + log_info_dict["e2e_train_time"] = time.time() - e2e_start_time + dllogger.log(step=(), data=log_info_dict, verbosity=0) + log(' Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' + 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'.format( + step=step, **log_info_dict), + all_rank=False) + + if local_step % args.gradient_accumulation_steps == 0: + checkpoint.step.assign(int(optimizer.iterations)) + + if not config.skip_checkpoint and (local_step % (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0): + saved_ckpt = True + if is_main_process(): + save_path = manager.save(checkpoint_number=step) + log(" ** Saved model checkpoint for step {}: {}".format(step, save_path)) + iter_save_path = iter_manager.save(checkpoint_number=step) + log(" ** Saved iterator checkpoint for step {}: {}".format(step, iter_save_path), all_rank=True) + local_step += 1 + if config.steps_this_run != -1 and (local_step % (config.steps_this_run * args.gradient_accumulation_steps) == 0): + #terminating run sooner as steps_this_run has been reached + log("terminating as steps_this_run:{} has been reached".format(config.steps_this_run)) + break + + step = (int(checkpoint.step) - 1) + dllogger.flush() + if not config.skip_checkpoint and not saved_ckpt: + if is_main_process(): + save_path = manager.save(checkpoint_number=step) + log(" ** Saved model checkpoint for step {}: {}".format(step, save_path)) + iter_save_path = iter_manager.save(checkpoint_number=step) + log(" ** Saved iterator checkpoint for step {}: {}".format(step, iter_save_path), all_rank=True) + + return args + + +if __name__ == "__main__": + start_time = time.time() + args = main(start_time) + log("Total Time:{:.4f}".format(time.time() - start_time)) + if is_main_process(): + postprocess_dllog(args) diff --git a/modelzoo/ELECTRA/run_tf_squad.py b/modelzoo/ELECTRA/run_tf_squad.py new file mode 100644 index 00000000..b76d6e12 --- /dev/null +++ b/modelzoo/ELECTRA/run_tf_squad.py @@ -0,0 +1,675 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
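In the pretraining loop above, an optimizer update is applied only every gradient_accumulation_steps micro-batches: train_one_step keeps adding gradients into the GradientAccumulator, take_step gates apply_gradients, and checkpoint.step is then advanced from optimizer.iterations, which counts applied updates rather than micro-batches. A minimal, self-contained sketch of that accounting (the plain-variable accumulator below is a stand-in for illustration, not the optimization.GradientAccumulator API):

    import tensorflow as tf

    ACCUM_STEPS = 4                      # stands in for config.gradient_accumulation_steps
    opt = tf.keras.optimizers.SGD(0.1)
    w = tf.Variable([1.0, 2.0])
    accum = [tf.Variable(tf.zeros_like(w), trainable=False)]   # one slot per trainable variable

    def micro_step(target):
        with tf.GradientTape() as tape:
            loss = tf.reduce_sum((w - target) ** 2)
        for slot, grad in zip(accum, tape.gradient(loss, [w])):
            slot.assign_add(grad)        # accumulate instead of applying immediately

    for local_step in range(1, 9):
        micro_step(tf.constant([0.0, 0.0]))
        if local_step % ACCUM_STEPS == 0:                       # the take_step condition above
            opt.apply_gradients(zip([s / ACCUM_STEPS for s in accum], [w]))
            for slot in accum:
                slot.assign(tf.zeros_like(slot))                # accumulator.reset()

    # After 8 micro-batches only 2 updates were applied, so optimizer.iterations == 2;
    # this is why the loop assigns checkpoint.step from optimizer.iterations.
    print(int(opt.iterations))

In the real loop the accumulated gradients are additionally averaged across workers with hvd.allreduce and clipped with tf.clip_by_global_norm before apply_gradients.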
+ +import os +import sys +import subprocess +import time +import argparse +import json +import logging + +import tensorflow as tf + +import horovod.tensorflow as hvd +from horovod.tensorflow.compression import Compression +from gpu_affinity import set_affinity + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +from tqdm import tqdm +import dllogger +from utils import is_main_process, format_step, get_rank, get_world_size, log +from configuration import ElectraConfig +from modeling import TFElectraForQuestionAnswering +from tokenization import ElectraTokenizer +from optimization import create_optimizer +from squad_utils import SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features, \ + SquadResult, RawResult, get_answers + + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/electra-small-generator", + "google/electra-base-generator", + "google/electra-large-generator", + "google/electra-small-discriminator", + "google/electra-base-discriminator", + "google/electra-large-discriminator", + # See all ELECTRA models at https://huggingface.co/models?filter=electra +] + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument("--electra_model", default=None, type=str, required=True, + help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST)) + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="Path to dataset.") + parser.add_argument("--output_dir", default=".", type=str, required=True, + help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument("--init_checkpoint", + default=None, + type=str, + help="The checkpoint file from pretraining") + + # Other parameters + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to use evaluate accuracy of predictions") + parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") + parser.add_argument("--predict_file", default=None, type=str, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + + parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") + parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") + parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay_rate", default=0.01, type=float, help="Weight decay if we apply some.") + parser.add_argument("--layerwise_lr_decay", default=0.8, type=float, + help="The layerwise learning rate decay. Shallower layers have lower learning rates.") + + parser.add_argument("--num_train_epochs", default=3, type=int, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1.0, type=float, + help="Total number of training steps to perform.") + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " + "of training.") + + parser.add_argument("--max_seq_length", default=384, type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.") + parser.add_argument("--doc_stride", default=128, type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.") + parser.add_argument("--max_query_length", default=64, type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.") + parser.add_argument("--vocab_file", default=None, type=str, + help="Path to vocabulary file use for tokenization") + parser.add_argument("--ci", action="store_true", help="true if running on CI") + parser.add_argument( + "--joint_head", + default=True, + type=bool, + help="Jointly predict the start and end positions", + ) + parser.add_argument( + "--beam_size", + default=4, + type=int, + help="Beam size when doing joint predictions", + ) + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + + parser.add_argument("--verbose_logging", action='store_true', + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. 
True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=os.getenv('LOCAL_RANK', -1), + help="local_rank for distributed training on gpus") + parser.add_argument('--amp', + action='store_true', + help="Automatic mixed precision training") + parser.add_argument('--fp16_all_reduce', + action='store_true', + help="Whether to use 16-bit all reduce") + parser.add_argument('--xla', + action='store_true', + help="Whether to use XLA") + parser.add_argument('--version_2_with_negative', + action='store_true', + help='If true, the SQuAD examples contain some that do not have an answer.') + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument('--log_freq', + type=int, default=50, + help='frequency of logging loss.') + parser.add_argument('--json-summary', type=str, default="results/dllogger.json", + help='If provided, the json summary will be written to the specified file.') + parser.add_argument("--eval_script", + help="Script to evaluate squad predictions", + default="evaluate.py", + type=str) + parser.add_argument("--use_env", + action='store_true', + help="Whether to read local rank from ENVVAR") + parser.add_argument('--skip_checkpoint', + default=False, + action='store_true', + help="Whether to save checkpoints") + parser.add_argument('--disable-progress-bar', + default=False, + action='store_true', + help='Disable tqdm progress bar') + parser.add_argument("--skip_cache", + default=False, + action='store_true', + help="Whether to cache train features") + parser.add_argument("--cache_dir", + default=None, + type=str, + help="Location to cache train feaures. Will default to the dataset direct") + args = parser.parse_args() + + if not args.do_train and (not args.init_checkpoint or args.init_checkpoint == 'None'): + raise ValueError("Checkpoint is required if do_train is not set") + + return args + + +def get_dataset_from_features(features, batch_size, drop_remainder=True, ngpu=8, mode="train", v2=False): + """Input function for training""" + + all_input_ids = tf.convert_to_tensor([f.input_ids for f in features], dtype=tf.int64) + all_input_mask = tf.convert_to_tensor([f.attention_mask for f in features], dtype=tf.int64) + all_segment_ids = tf.convert_to_tensor([f.token_type_ids for f in features], dtype=tf.int64) + all_start_pos = tf.convert_to_tensor([f.start_position for f in features], dtype=tf.int64) + all_end_pos = tf.convert_to_tensor([f.end_position for f in features], dtype=tf.int64) + + # if v2 else None: + all_cls_index = tf.convert_to_tensor([f.cls_index for f in features], dtype=tf.int64) + all_p_mask = tf.convert_to_tensor([f.p_mask for f in features], dtype=tf.float32) + all_is_impossible = tf.convert_to_tensor([f.is_impossible for f in features], dtype=tf.float32) + + dataset = tf.data.Dataset.from_tensor_slices( + (all_input_ids, all_input_mask, all_segment_ids, all_start_pos, all_end_pos) + + (all_cls_index, all_p_mask, all_is_impossible)) + if ngpu > 1: + dataset = dataset.shard(get_world_size(), get_rank()) + + if mode == "train": + dataset = dataset.shuffle(batch_size * 3) + # dataset = dataset.map(self._preproc_samples, + # num_parallel_calls=multiprocessing.cpu_count()//self._num_gpus) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + dataset = dataset.prefetch(batch_size) + + return dataset + + +@tf.function +def train_step(model, inputs, loss, amp, opt, init, v2=False, 
loss_class=None, fp16=False, clip_norm=1.0): + with tf.GradientTape() as tape: + [input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible] = inputs + + if not v2: + is_impossible = None + + start_logits, end_logits, cls_logits = model(input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + p_mask=p_mask, + is_impossible=is_impossible, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=True, + )[0:3] + + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = tf.squeeze(start_positions, axis=-1, name="squeeze_start_positions") + if len(end_positions.shape) > 1: + end_positions = tf.squeeze(end_positions, axis=-1, name="squeeze_end_positions") + if is_impossible is not None and len(is_impossible.shape) > 1 and v2 and cls_logits is not None: + is_impossible = tf.squeeze(is_impossible, axis=-1, name="squeeze_is_impossible") + + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = tf.clip_by_value(start_positions, 0, ignored_index, name="clip_start_positions") + end_positions = tf.clip_by_value(end_positions, 0, ignored_index, name="clip_end_positions") + + start_loss = loss(y_true=start_positions, y_pred=tf.cast(start_logits, tf.float32)) + end_loss = loss(y_true=end_positions, y_pred=tf.cast(end_logits, tf.float32)) + loss_value = (start_loss + end_loss) / 2 + + if v2: + cls_loss_value = loss_class(y_true=is_impossible, y_pred=tf.cast(cls_logits, tf.float32)) + loss_value += cls_loss_value * 0.5 + + unscaled_loss = tf.stop_gradient(loss_value) + if amp: + loss_value = opt.get_scaled_loss(loss_value) + + tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True, + compression=Compression.fp16 if fp16 else Compression.none) + gradients = tape.gradient(loss_value, model.trainable_variables) + if amp: + gradients = opt.get_unscaled_gradients(gradients) + (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm) + opt.apply_gradients(zip(gradients, model.trainable_variables)) # , clip_norm=1.0) + + if init: + hvd.broadcast_variables(model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return unscaled_loss # , outputs#, tape.gradient(loss_value, model.trainable_variables) + + +@tf.function +def infer_step(model, input_ids, + attention_mask=None, + token_type_ids=None, + cls_index=None, + p_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + return model(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + cls_index=cls_index, + p_mask=p_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) + + +def main(): + args = parse_args() + + hvd.init() + set_affinity(hvd.local_rank()) + + if is_main_process(): + log("Running total processes: {}".format(get_world_size())) + log("Starting process: {}".format(get_rank())) + + if is_main_process(): + dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, + filename=args.json_summary), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) + else: + dllogger.init(backends=[]) + + dllogger.metadata("exact_match", {"unit": None}) + dllogger.metadata("F1", {"unit": None}) + 
dllogger.metadata("inference_sequences_per_second", {"unit": "sequences/s"}) + dllogger.metadata("training_sequences_per_second", {"unit": "sequences/s"}) + + tf.random.set_seed(args.seed) + dllogger.log(step="PARAMETER", data={"SEED": args.seed}) + # script parameters + BATCH_SIZE = args.train_batch_size + EVAL_BATCH_SIZE = args.predict_batch_size + USE_XLA = args.xla + USE_AMP = args.amp + EPOCHS = args.num_train_epochs + + if not args.do_train: + EPOCHS = args.num_train_epochs = 1 + log("Since running inference only, setting args.num_train_epochs to 1") + + if not os.path.exists(args.output_dir) and is_main_process(): + os.makedirs(args.output_dir) + + # TensorFlow configuration + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.optimizer.set_jit(USE_XLA) + #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) + + if args.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + if is_main_process(): + log("***** Loading tokenizer and model *****") + # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) + electra_model = args.electra_model + config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir) + config.update({"amp": args.amp}) + if args.vocab_file is None: + tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir) + else: + tokenizer = ElectraTokenizer( + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case) + + model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args) + + if is_main_process(): + log("***** Loading dataset *****") + # Load data + processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() + train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None + dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None + + if is_main_process(): + log("***** Loading features *****") + # Load cached features + squad_version = '2.0' if args.version_2_with_negative else '1.1' + if args.cache_dir is None: + args.cache_dir = args.data_dir + cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format( + electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), + str(args.max_query_length), squad_version) + cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format( + electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), + str(args.max_query_length), squad_version) + + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) if args.do_train else [] + with open(cached_dev_features_file, "rb") as reader: + dev_features = pickle.load(reader) if args.do_predict else [] + except: + train_features = ( # TODO: (yy) do on rank 0? 
+ squad_convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=True, + return_dataset="", + ) + if args.do_train + else [] + ) + dev_features = ( + squad_convert_examples_to_features( + examples=dev_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=False, + return_dataset="", + ) + if args.do_predict + else [] + ) + # Dump Cached features + if not args.skip_cache and is_main_process(): + if args.do_train: + log("***** Building Cache Files: {} *****".format(cached_train_features_file)) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + if args.do_predict: + log("***** Building Cache Files: {} *****".format(cached_dev_features_file)) + with open(cached_dev_features_file, "wb") as writer: + pickle.dump(dev_features, writer) + + len_train_features = len(train_features) + total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1 + train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1 + len_dev_features = len(dev_features) + total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1 + + train_dataset = get_dataset_from_features(train_features, BATCH_SIZE, + v2=args.version_2_with_negative) if args.do_train else [] + dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev", + v2=args.version_2_with_negative) if args.do_predict else [] + + opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps, + num_warmup_steps=int(args.warmup_proportion * total_train_steps), + weight_decay_rate=args.weight_decay_rate, + layerwise_lr_decay=args.layerwise_lr_decay, + n_transformer_layers=model.num_hidden_layers) + if USE_AMP: + # loss scaling is currently required when using mixed precision + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") + + # Define loss function + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + loss_class = tf.keras.losses.BinaryCrossentropy( + from_logits=True, + name='binary_crossentropy' + ) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + model.compile(optimizer=opt, loss=loss, metrics=[metric]) + train_loss_results = [] + + if args.do_train and is_main_process(): + log("***** Running training *****") + log(" Num examples = ", len_train_features) + log(" Num Epochs = ", args.num_train_epochs) + log(" Instantaneous batch size per GPU = ", args.train_batch_size) + log( + " Total train batch size (w. 
parallel, distributed & accumulation) = ", + args.train_batch_size + * get_world_size(), + ) + log(" Total optimization steps =", total_train_steps) + + total_train_time = 0 + latency = [] + for epoch in range(EPOCHS): + if args.do_train: + epoch_loss_avg = tf.keras.metrics.Mean() + epoch_perf_avg = tf.keras.metrics.Mean() + epoch_start = time.time() + + epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5, + disable=not is_main_process()) + for iter, inputs in enumerate(epoch_iterator): + # breaking criterion if max_steps if > 1 + if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps: + break + iter_start = time.time() + # Optimize the model + loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0), + v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP) + #introduce CPU-GPU sync for training perf computation + loss_numpy = loss_value.numpy() + + epoch_perf_avg.update_state(1. * BATCH_SIZE / (time.time() - iter_start)) + if iter % args.log_freq == 0: + if is_main_process(): + log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value, + epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1, + int(opt.iterations))) + dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()), + "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())}) + + # Track progress + epoch_loss_avg.update_state(loss_value) # Add current batch loss + + # End epoch + train_loss_results.append(epoch_loss_avg.result()) + total_train_time += float(time.time() - epoch_start) + # Summarize and save checkpoint at the end of each epoch + if is_main_process(): + + dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time, + "training_sequences_per_second": float( + epoch_perf_avg.result().numpy() * get_world_size()), + "final_loss": float(epoch_loss_avg.result().numpy())}) + + if not args.skip_checkpoint: + if args.ci: + checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1) + else: + checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1) + if is_main_process(): + model.save_weights(checkpoint_name) + + + if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1): + if not args.do_train: + log("***** Loading checkpoint: {} *****".format(args.init_checkpoint)) + model.load_weights(args.init_checkpoint).expect_partial() + + current_feature_id = 0 + all_results = [] + if is_main_process(): + log("***** Running evaluation *****") + log(" Num Batches = ", total_dev_steps) + log(" Batch size = ", args.predict_batch_size) + + raw_infer_start = time.time() + if is_main_process(): + infer_perf_avg = tf.keras.metrics.Mean() + dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5, + disable=not is_main_process()) + for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator: + # training=False is needed only if there are layers with different + # behavior during training versus inference (e.g. Dropout). 
+ + iter_start = time.time() + + if not args.joint_head: + batch_start_logits, batch_end_logits = infer_step(model, input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + )[:2] + #Synchronize with GPU to compute time + _ = batch_start_logits.numpy() + + else: + + outputs = infer_step(model, input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + cls_index=cls_index, + p_mask=p_mask, + ) + #Synchronize with GPU to compute time + _ = outputs[0].numpy() + + infer_time = (time.time() - iter_start) + infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time) + latency.append(infer_time) + + for iter_ in range(input_ids.shape[0]): + + if not args.joint_head: + start_logits = batch_start_logits[iter_].numpy().tolist() + end_logits = batch_end_logits[iter_].numpy().tolist() + dev_feature = dev_features[current_feature_id] + current_feature_id += 1 + unique_id = int(dev_feature.unique_id) + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + else: + dev_feature = dev_features[current_feature_id] + current_feature_id += 1 + unique_id = int(dev_feature.unique_id) + output = [output[iter_].numpy().tolist() for output in outputs] + + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + cls_logits = output[4] + result = SquadResult( + unique_id, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + all_results.append(result) + + # Compute and save predictions + answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args) + + output_prediction_file = os.path.join(args.output_dir, "predictions.json") + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") + e2e_infer_time = time.time() - raw_infer_start + # if args.version_2_with_negative: + # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") + # else: + # output_null_log_odds_file = None + with open(output_prediction_file, "w") as f: + f.write(json.dumps(answers, indent=4) + "\n") + with open(output_nbest_file, "w") as f: + f.write(json.dumps(nbest_answers, indent=4) + "\n") + + if args.do_eval: + if args.version_2_with_negative: + dev_file = "dev-v2.0.json" + else: + dev_file = "dev-v1.1.json" + + eval_out = subprocess.check_output([sys.executable, args.eval_script, + args.data_dir + "/" + dev_file, output_prediction_file]) + log(eval_out.decode('UTF-8')) + scores = str(eval_out).strip() + exact_match = float(scores.split(":")[1].split(",")[0]) + if args.version_2_with_negative: + f1 = float(scores.split(":")[2].split(",")[0]) + else: + f1 = float(scores.split(":")[2].split("}")[0]) + + log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8'))) + log("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s" + .format(epoch, exact_match, f1, infer_perf_avg.result())) + + latency_all = sorted(latency)[:-2] + log( + "**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms" + .format(epoch, sum(latency_all) / len(latency_all) * 1000, + sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000, + sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000, + sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000, + )) + dllogger.log(step=tuple(), + data={"inference_sequences_per_second": 
float(infer_perf_avg.result().numpy()), + "e2e_inference_time": e2e_infer_time}) + + if is_main_process() and args.do_train and args.do_eval: + log( + "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s" + .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(), + infer_perf_avg.result())) + dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1}) + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh b/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh new file mode 100644 index 00000000..cbeac9ed --- /dev/null +++ b/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +train_batch_size_p1=${1:-"176"} +learning_rate_p1="6e-7" +precision=${2:-"amp"} +xla=${3:-"xla"} +num_gpus=${4:-8} +warmup_steps_p1="10" +train_steps_p1=10 +save_checkpoint_steps=500 +resume_training="false" +optimizer="lamb" +accumulate_gradients=${5:-"true"} +gradient_accumulation_steps_p1=${6:-48} +seed=42 +job_name="electra_lamb_pretraining_benchmark" +train_batch_size_p2=${7:-24} +learning_rate_p2="4e-7" +warmup_steps_p2="10" +train_steps_p2=10 +gradient_accumulation_steps_p2=${8:-144} +electra_model=${9:-"base"} + +restore_checkpoint=false bash scripts/run_pretraining.sh $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model diff --git a/modelzoo/ELECTRA/scripts/benchmark_squad.sh b/modelzoo/ELECTRA/scripts/benchmark_squad.sh new file mode 100644 index 00000000..39263d65 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/benchmark_squad.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
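The 90/95/99% figures in the LATENCY SUMMARY line above are not point percentiles: the SQuAD script sorts the per-batch inference times, drops the two slowest, and then averages the fastest 90/95/99% of what remains. A standalone sketch of that arithmetic (the sample timings are made up):

    # Same arithmetic as the latency summary in run_tf_squad.py; the sample data is illustrative.
    latency = [0.031, 0.029, 0.030, 0.055, 0.028, 0.032, 0.030, 0.090, 0.029, 0.031]

    latency_all = sorted(latency)[:-2]              # drop the two slowest batches

    def avg_ms(values):
        return sum(values) / len(values) * 1000.0

    print("Ave: {:.3f} ms".format(avg_ms(latency_all)))
    for frac in (0.90, 0.95, 0.99):
        n = int(len(latency_all) * frac)            # fastest `frac` of the remaining batches
        print("{:.0%}: {:.3f} ms".format(frac, avg_ms(latency_all[:n])))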
+ + +mode=${1:-"train"} +num_gpu=${2:-"8"} +batch_size=${3:-"16"} +infer_batch_size=${4:-"$batch_size"} +precision=${5:-"amp"} +SQUAD_VERSION=${6:-"1.1"} +squad_dir=${7:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"} +OUT_DIR=${8:-"results/"} +init_checkpoint=${9:-"None"} +cache_dir=${10:-"$squad_dir"} + +bash scripts/run_squad.sh google/electra-base-discriminator 1 $batch_size $infer_batch_size 8e-4 $precision $num_gpu $RANDOM $SQUAD_VERSION $squad_dir $OUT_DIR $init_checkpoint $mode interactive $cache_dir 200 diff --git a/modelzoo/ELECTRA/scripts/bind.sh b/modelzoo/ELECTRA/scripts/bind.sh new file mode 100755 index 00000000..0d1a69b5 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/bind.sh @@ -0,0 +1,226 @@ +#! /bin/bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +print_usage() { + cat << EOF +${0} [options] [--] COMMAND [ARG...] + +Control binding policy for each task. Assumes one rank will be launched for each GPU. + +Options: + --cpu=MODE + * exclusive -- bind each rank to an exclusive set of cores near its GPU + * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading + * node -- bind each rank to all cores in the NUMA node nearest its GPU [default] + * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file + * off -- don't bind + --mem=MODE + * node -- bind each rank to the nearest NUMA node [default] + * *.sh -- bind each rank using the bash associative array bind_mem from a file + * off -- don't bind + --ib=MODE + * single -- bind each rank to a single IB device near its GPU + * off -- don't bind [default] + --cluster=CLUSTER + Select which cluster is being used. May be required if system params cannot be detected. 
+EOF +} + +################################################################################ +# Argument parsing +################################################################################ + +cpu_mode='node' +mem_mode='node' +ib_mode='off' +cluster='' +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) print_usage ; exit 0 ;; + --cpu=*) cpu_mode="${1/*=/}"; shift ;; + --cpu) cpu_mode="$2"; shift 2 ;; + --mem=*) mem_mode="${1/*=/}"; shift ;; + --mem) mem_mode="$2"; shift 2 ;; + --ib=*) ib_mode="${1/*=/}"; shift ;; + --ib) ib_mode="$2"; shift 2 ;; + --cluster=*) cluster="${1/*=/}"; shift ;; + --cluster) cluster="$2"; shift 2 ;; + --) shift; break ;; + *) break ;; + esac +done +if [ $# -lt 1 ]; then + echo 'ERROR: no command given' 2>&1 + print_usage + exit 1 +fi + +################################################################################ +# Get system params +################################################################################ + +# LOCAL_RANK is set with an enroot hook for Pytorch containers +# SLURM_LOCALID is set by Slurm +# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun +readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}" +if [ -z "${local_rank}" ]; then + echo 'ERROR: cannot read LOCAL_RANK from env' >&2 + exit 1 +fi + +num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits) +if [ "${local_rank}" -ge "${num_gpus}" ]; then + echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2 + exit 1 +fi + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} +lscpu_out=$(lscpu) +num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") +num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}") +cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}" + +readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes )) +if [ ${num_gpus} -gt 1 ]; then + readonly gpus_per_node=$(( num_gpus / num_nodes )) +else + readonly gpus_per_node=1 +fi +readonly cores_per_gpu=$(( cores_per_node / gpus_per_node )) +readonly local_node=$(( local_rank / gpus_per_node )) + + +declare -a ibdevs=() +case "${cluster}" in + circe) + # Need to specialize for circe because IB detection is hard + ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10) + ;; + selene) + # Need to specialize for selene because IB detection is hard + ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9) + ;; + '') + if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then + mapfile -t ibdevs <<< "${ibstat_out}" + fi + ;; + *) + echo "ERROR: Unknown cluster '${cluster}'" >&2 + exit 1 + ;; +esac +readonly num_ibdevs="${#ibdevs[@]}" + +################################################################################ +# Setup for exec +################################################################################ + +declare -a numactl_args=() + +case "${cpu_mode}" in + exclusive) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + $(( (local_rank + 1) * cores_per_gpu - 1 )) \ + $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \ + $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \ + )" ) + ;; + exclusive,nosmt) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + 
$(( (local_rank + 1) * cores_per_gpu - 1 )) \ + )" ) + ;; + node) + numactl_args+=( "--cpunodebind=${local_node}" ) + ;; + *.sh) + source "${cpu_mode}" + if [ -n "${bind_cpu_cores:-}" ]; then + numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" ) + elif [ -n "${bind_cpu_nodes:-}" ]; then + numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" ) + else + echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2 + exit 1 + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${mem_mode}" in + node) + numactl_args+=( "--membind=${local_node}" ) + ;; + *.sh) + source "${mem_mode}" + if [ -z "${bind_mem:-}" ]; then + echo "ERROR: invalid memory affinity file ${mem_mode}." >&2 + exit 1 + fi + numactl_args+=( "--membind=${bind_mem[${local_rank}]}" ) + ;; + off|'') + ;; + *) + echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${ib_mode}" in + single) + if [ "${num_ibdevs}" -eq 0 ]; then + echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1 + else + readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}" + export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}" + export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}" + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +################################################################################ +# Exec +################################################################################ + +if [ "${#numactl_args[@]}" -gt 0 ] ; then + set -x + exec numactl "${numactl_args[@]}" -- "${@}" +else + exec "${@}" +fi diff --git a/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh b/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh new file mode 100644 index 00000000..7ddb3299 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh @@ -0,0 +1,411 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
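For reference, bind.sh's --cpu=exclusive mode pins each rank to two CPU ranges: its own slice of physical cores plus the matching hyperthread siblings, which under a typical Linux enumeration sit cores_per_gpu * gpus_per_node * num_nodes logical-CPU IDs higher (that product equals the total physical core count). A small sketch of that index arithmetic for a hypothetical 2-socket, 64-physical-core, 8-GPU machine (the topology numbers are assumed for illustration, not detected):

    # Mirrors the --physcpubind computation in bind.sh; topology values are assumed, not probed.
    num_sockets, cores_per_socket, num_numa_nodes, num_gpus = 2, 32, 2, 8

    cores_per_node = (num_sockets * cores_per_socket) // num_numa_nodes    # 32
    gpus_per_node = num_gpus // num_numa_nodes                             # 4
    cores_per_gpu = cores_per_node // gpus_per_node                        # 8
    smt_offset = cores_per_gpu * gpus_per_node * num_numa_nodes            # 64 -> first SMT sibling

    for local_rank in range(num_gpus):
        first = local_rank * cores_per_gpu
        last = (local_rank + 1) * cores_per_gpu - 1
        print("rank %d -> --physcpubind=%d-%d,%d-%d"
              % (local_rank, first, last, first + smt_offset, last + smt_offset))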
+ +# Full pretraining configs for NVIDIA DGX-A100 (8x NVIDIA A100 40GB GPU) + +dgxa100_8gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=8 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=48 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=144 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgxa100_8gpu_tf32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="tf32" + xla="xla" + num_gpus=8 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=96 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=288 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + + +# Full pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) + +dgx2_16gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=16 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=24 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=72 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx2_16gpu_fp32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=16 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=48 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=144 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 
$seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+# Full pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_amp ()
+{
+ train_batch_size_p1="88"
+ learning_rate_p1="6e-3"
+ precision="amp"
+ xla="xla"
+ num_gpus=8
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=96
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=12
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=288
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+dgx1_8gpu_fp32 ()
+{
+ train_batch_size_p1="40"
+ learning_rate_p1="6e-3"
+ precision="fp32"
+ xla="xla"
+ num_gpus=8
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=211
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=6
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=576
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+# Full pretraining configs for NVIDIA DGX-A100 (1x NVIDIA A100 40GB GPU)
+
+dgxa100_1gpu_amp ()
+{
+ train_batch_size_p1="176"
+ learning_rate_p1="6e-3"
+ precision="amp"
+ xla="xla"
+ num_gpus=1
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=384
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=24
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=1152
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+dgxa100_1gpu_tf32 ()
+{
+ train_batch_size_p1="88"
+ learning_rate_p1="6e-3"
+ precision="tf32"
+ xla="xla"
+ num_gpus=1
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=768
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=12
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=2304
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ 
$warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +# Full pretraining configs for NVIDIA DGX-2H (1x NVIDIA V100 32GB GPU) + +dgx2_1gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=384 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=1152 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx2_1gpu_fp32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=768 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=2304 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +# Full pretraining configs for NVIDIA DGX-1 (1x NVIDIA V100 16GB GPU) + +dgx1_1gpu_amp () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=768 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=2304 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx1_1gpu_fp32 () +{ + train_batch_size_p1="40" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=1689 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=6 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + 
train_steps_p2=933
+ gradient_accumulation_steps_p2=4608
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
diff --git a/modelzoo/ELECTRA/scripts/configs/squad_config.sh b/modelzoo/ELECTRA/scripts/configs/squad_config.sh
new file mode 100644
index 00000000..a29eb71c
--- /dev/null
+++ b/modelzoo/ELECTRA/scripts/configs/squad_config.sh
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dgxa100_8gpu_amp ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="512"
+ learning_rate="8e-4"
+ precision="amp"
+ num_gpu="8"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+dgxa100_8gpu_tf32 ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="512"
+ learning_rate="8e-4"
+ precision="tf32"
+ num_gpu="8"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_amp ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="256"
+ learning_rate="1e-3"
+ precision="amp"
+ num_gpu="16"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+dgx2_16gpu_fp32 ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="256"
+ learning_rate="1e-3"
+ precision="fp32"
+ num_gpu="16"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed 
$SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) + +dgx1_8gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="16" + infer_batch_size="256" + learning_rate="4e-4" + precision="amp" + num_gpu="8" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx1_8gpu_fp32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="8" + infer_batch_size="256" + learning_rate="3e-4" + precision="fp32" + num_gpu="8" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +#Squad 2.0 +dgx1_8gpu_amp_v2 () +{ + electra_model="google/electra-base-discriminator" + epochs="3" + batch_size="16" + infer_batch_size="256" + learning_rate="4e-4" + precision="amp" + num_gpu="8" + seed="1" + SQUAD_VERSION="2.0" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} +# 1GPU configs + +dgxa100_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="512" + learning_rate="2e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgxa100_1gpu_tf32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="512" + learning_rate="2e-4" + precision="tf32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) + +dgx2_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="256" + learning_rate="2e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx2_1gpu_fp32 () +{ + 
electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="256" + learning_rate="2e-4" + precision="fp32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) + +dgx1_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="16" + infer_batch_size="256" + learning_rate="1e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx1_1gpu_fp32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="8" + infer_batch_size="256" + learning_rate="1e-4" + precision="fp32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} diff --git a/modelzoo/ELECTRA/scripts/docker/build.sh b/modelzoo/ELECTRA/scripts/docker/build.sh new file mode 100644 index 00000000..b0f1ec61 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/docker/build.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +docker build --network=host . --rm -t electra diff --git a/modelzoo/ELECTRA/scripts/docker/launch.sh b/modelzoo/ELECTRA/scripts/docker/launch.sh new file mode 100644 index 00000000..b5bd7d60 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/docker/launch.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CMD=${1:-/bin/bash} +NV_VISIBLE_DEVICES=${2:-"all"} +DOCKER_BRIDGE=${3:-"host"} + +docker run -it --rm \ + --gpus device=$NV_VISIBLE_DEVICES \ + --net=$DOCKER_BRIDGE \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --privileged \ + -e LD_LIBRARY_PATH='/workspace/install/lib/' \ + -v $PWD:/workspace/electra \ + electra $CMD diff --git a/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh b/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh new file mode 100644 index 00000000..39d6a8c5 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +checkpoints=${checkpoints:-"results/base/checkpoints"} +for folder in $checkpoints; do + + ckpts_dir=${folder} + output_dir=${folder} + + for f in $ckpts_dir/*.index; do + ckpt=${f%.*} + echo "==================================== START $ckpt ====================================" + python postprocess_pretrained_ckpt.py --pretrained_checkpoint=$ckpt --output_dir=$output_dir --amp + bash scripts/run_squad.sh $output_dir/discriminator; + echo "==================================== END $ckpt ===================================="; + done +done diff --git a/modelzoo/ELECTRA/scripts/run_pretraining.sh b/modelzoo/ELECTRA/scripts/run_pretraining.sh new file mode 100644 index 00000000..7fd3feb8 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/run_pretraining.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
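+
+# Positional arguments 1-24 are parsed below; any that are unset fall back to the
+# declared defaults. Illustrative phase-1 launch (values mirror the defaults and are
+# not a tuning recommendation):
+#   bash scripts/run_pretraining.sh 176 6e-3 amp 8 xla 2000 10000 500 false lamb true 48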
+ +echo "Container nvidia build = " $NVIDIA_BUILD_ID +train_batch_size_p1=${1:-176} +learning_rate_p1=${2:-"6e-3"} +precision=${3:-"amp"} +num_gpus=${4:-8} +xla=${5:-"xla"} +warmup_steps_p1=${6:-"2000"} +train_steps_p1=${7:-10000} +save_checkpoint_steps=${8:-500} +resume_training=${9:-"false"} +optimizer=${10:-"lamb"} +accumulate_gradients=${11:-"true"} +gradient_accumulation_steps_p1=${12:-48} +seed=${13:-12439} +job_name=${14:-"electra_lamb_pretraining"} +train_batch_size_p2=${15:-24} +learning_rate_p2=${16:-"4e-3"} +warmup_steps_p2=${17:-"200"} +train_steps_p2=${18:-933} +gradient_accumulation_steps_p2=${19:-144} +ELECTRA_MODEL=${20:-"base"} +DATASET_P1="tfrecord_lower_case_1_seq_len_128_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*" # change this for other datasets +DATA_DIR_P1=${21:-"$DATA_PREP_WORKING_DIR/$DATASET_P1"} +DATASET_P2="tfrecord_lower_case_1_seq_len_512_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*" # change this for other datasets +DATA_DIR_P2=${22:-"$DATA_PREP_WORKING_DIR/$DATASET_P2"} +CODEDIR=${23:-"/workspace/electra"} +init_checkpoint=${24:-"None"} +restore_checkpoint=${restore_checkpoint:-"true"} +RESULTS_DIR=$CODEDIR/results + +if [ ! -d "$RESULTS_DIR" ] ; then + echo "Error! $RESULTS_DIR directory missing." + exit -1 +fi + +PREFIX="" +TEST_RESULT=$(awk 'BEGIN {print ('1' <= '${num_gpus}')}') +if [ "$TEST_RESULT" == 1 ] ; then + PREFIX="horovodrun -np $num_gpus " +fi + +if [ "$precision" = "amp" ] ; then + PREC="--amp " +elif [ "$precision" = "fp32" ] ; then + PREC="" +elif [ "$precision" = "tf32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +if [ "$xla" = "xla" ] ; then + PREC="$PREC --xla" +fi + +ACCUMULATE_GRADIENTS="" +if [ "$accumulate_gradients" == "true" ] ; then + ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_p1" +fi + +CHECKPOINT="" +if [ "$resume_training" == "true" ] ; then + CHECKPOINT="--restore_checkpoint=latest" +fi + +if [ "$init_checkpoint" != "None" ] ; then + CHECKPOINT="--restore_checkpoint=$init_checkpoint" +fi + +CMD=" $CODEDIR/run_pretraining.py" +CMD+=" --model_name=${ELECTRA_MODEL}" +CMD+=" --pretrain_tfrecords=$DATA_DIR_P1" +CMD+=" --model_size=${ELECTRA_MODEL}" +CMD+=" --train_batch_size=$train_batch_size_p1" +CMD+=" --max_seq_length=128 --disc_weight=50.0 --generator_hidden_size=0.3333333 " +CMD+=" --num_train_steps=$train_steps_p1" +CMD+=" --num_warmup_steps=$warmup_steps_p1" +CMD+=" --save_checkpoints_steps=$save_checkpoint_steps" +CMD+=" --learning_rate=$learning_rate_p1" +CMD+=" --optimizer=${optimizer} --skip_adaptive --opt_beta_1=0.878 --opt_beta_2=0.974 --lr_decay_power=0.5" +CMD+=" --seed=$seed" +CMD+=" $PREC" +CMD+=" $ACCUMULATE_GRADIENTS" +CMD+=" $CHECKPOINT" +CMD+=" --log_dir ${RESULTS_DIR} " + +CMD="$PREFIX python3 $CMD" +echo "Launch command: $CMD" + +printf -v TAG "electra_pretraining_phase1_%s" "$precision" +DATESTAMP=`date +'%y%m%d%H%M%S'` +LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log +printf "Logs written to %s\n" "$LOGFILE" + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi + +set +x + +echo "finished pretraining phase1" + +#Start Phase2 +ACCUMULATE_GRADIENTS="" +if [ "$accumulate_gradients" == "true" ] ; then + ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_p2" +fi + +RESTORE_CHECKPOINT="" +if [ "$restore_checkpoint" == "true" ] ; then + RESTORE_CHECKPOINT="--restore_checkpoint=latest --phase2" +fi + +CMD=" $CODEDIR/run_pretraining.py" +CMD+=" 
--model_name=${ELECTRA_MODEL}" +CMD+=" --pretrain_tfrecords=$DATA_DIR_P2" +CMD+=" --model_size=${ELECTRA_MODEL}" +CMD+=" --train_batch_size=$train_batch_size_p2" +CMD+=" --max_seq_length=512 --disc_weight=50.0 --generator_hidden_size=0.3333333 ${RESTORE_CHECKPOINT}" +CMD+=" --num_train_steps=$train_steps_p2" +CMD+=" --num_warmup_steps=$warmup_steps_p2" +CMD+=" --save_checkpoints_steps=$save_checkpoint_steps" +CMD+=" --learning_rate=$learning_rate_p2" +CMD+=" --optimizer=${optimizer} --skip_adaptive --opt_beta_1=0.878 --opt_beta_2=0.974 --lr_decay_power=0.5" +CMD+=" --seed=$seed" +CMD+=" $PREC" +CMD+=" $ACCUMULATE_GRADIENTS" +CMD+=" --log_dir ${RESULTS_DIR} " + +CMD="$PREFIX python3 $CMD" +echo "Launch command: $CMD" + + +printf -v TAG "electra_pretraining_phase2_%s" "$precision" +DATESTAMP=`date +'%y%m%d%H%M%S'` +LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log +printf "Logs written to %s\n" "$LOGFILE" + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi + +set +x + +echo "finished pretraining phase2" diff --git a/modelzoo/ELECTRA/scripts/run_squad.sh b/modelzoo/ELECTRA/scripts/run_squad.sh new file mode 100644 index 00000000..c9ac17bf --- /dev/null +++ b/modelzoo/ELECTRA/scripts/run_squad.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +electra_model=${1:-"google/electra-base-discriminator"} +epochs=${2:-"2"} +batch_size=${3:-"16"} +infer_batch_size=${4:-"128"} +learning_rate=${5:-"4e-4"} +precision=${6:-"amp"} +num_gpu=${7:-"8"} +seed=${8:-"$RANDOM"} +SQUAD_VERSION=${9:-"1.1"} +squad_dir=${10:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"} +OUT_DIR=${11:-"results/"} +init_checkpoint=${12:-"None"} +mode=${13:-"train_eval"} +env=${14:-"interactive"} +cache_dir=${15:-"$squad_dir"} +max_steps=${16:-"-1"} + +echo "out dir is $OUT_DIR" +mkdir -p $OUT_DIR +if [ ! -d "$OUT_DIR" ]; then + echo "ERROR: non existing $OUT_DIR" + exit 1 +fi + +use_fp16="" +if [ "$precision" = "amp" ] ; then + echo "mixed-precision training and xla activated!" 
+ use_fp16=" --amp --xla " +fi + +if [ "$num_gpu" = "1" ] ; then + export CUDA_VISIBLE_DEVICES=0 + mpi_command=" " +else + unset CUDA_VISIBLE_DEVICES + mpi_command=" horovodrun -np $num_gpu " +fi + +if [ "$env" = "cluster" ] ; then + unset CUDA_VISIBLE_DEVICES + mpi_command=" " +fi + +v2="" +echo "Running SQuAD-v$SQUAD_VERSION" +if [ "$SQUAD_VERSION" = "2.0" ] ; then + v2=" --version_2_with_negative " +fi + +CMD=" $mpi_command python run_tf_squad.py " +CMD+="--init_checkpoint=$init_checkpoint " +if [ "$mode" = "train" ] ; then + CMD+="--do_train " + CMD+="--train_batch_size=$batch_size " +elif [ "$mode" = "eval" ] ; then + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " + CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py " + CMD+="--do_eval " +elif [ "$mode" = "prediction" ] ; then + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " +else + CMD+=" --do_train " + CMD+=" --train_batch_size=$batch_size " + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " + CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py " + CMD+="--do_eval " +fi + +CMD+=" $v2 " +CMD+=" --data_dir $squad_dir " +CMD+=" --do_lower_case " +CMD+=" --electra_model=$electra_model " +CMD+=" --learning_rate=$learning_rate " +CMD+=" --warmup_proportion 0.05 " +CMD+=" --weight_decay_rate 0.01 " +CMD+=" --layerwise_lr_decay 0.8 " +CMD+=" --seed=$seed " +CMD+=" --num_train_epochs=$epochs " +CMD+=" --max_seq_length=384 " +CMD+=" --doc_stride=128 " +CMD+=" --beam_size 5 " +CMD+=" --joint_head True " +CMD+=" --null_score_diff_threshold -5.6 " +CMD+=" --output_dir=$OUT_DIR " +CMD+=" $use_fp16" +CMD+=" --cache_dir=$cache_dir " +CMD+=" --max_steps=$max_steps " +CMD+=" --vocab_file=/workspace/electra/vocab/vocab.txt " + +LOGFILE=$OUT_DIR/logfile.txt +echo "$CMD |& tee $LOGFILE" +time $CMD |& tee $LOGFILE diff --git a/modelzoo/ELECTRA/squad_utils.py b/modelzoo/ELECTRA/squad_utils.py new file mode 100644 index 00000000..a15c4dd9 --- /dev/null +++ b/modelzoo/ELECTRA/squad_utils.py @@ -0,0 +1,1093 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import csv +import logging +import os +import math +import collections +from functools import partial +from multiprocessing import Pool, cpu_count +import horovod.tensorflow as hvd + +import numpy as np +from tqdm import tqdm + + +from file_utils import is_tf_available, is_torch_available +from tokenization_utils import BasicTokenizer, whitespace_tokenize + +if is_torch_available(): + import torch + from torch.utils.data import TensorDataset + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = ( + tokenizer.max_len - tokenizer.max_len_single_sentence + 1 + if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) + else tokenizer.max_len - tokenizer.max_len_single_sentence + ) + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + + encoded_dict = tokenizer.encode_plus( + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + max_length=max_seq_length, + return_overflowing_tokens=True, + pad_to_max_length=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", + return_token_type_ids=True, + ) + + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) + + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + if tokenizer.padding_side == "right": + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] + else: + last_padding_id_position = ( + len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) + ) + non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] + + else: + non_padded_ids = encoded_dict["input_ids"] + + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = ( + j + if 
tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span["input_ids"].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) + p_mask = np.array(span["token_type_ids"]) + + p_mask = np.minimum(p_mask, 1) + + if tokenizer.padding_side == "right": + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. + unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + ) + ) + return features + + +def squad_convert_example_to_features_init(tokenizer_for_convert): + global tokenizer + tokenizer = tokenizer_for_convert + + +def squad_convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1 +): + """ + Converts a list of examples into a list of features that can be directly given as input to a model. + It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of :class:`~transformers.data.processors.squad.SquadExample` + tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: whether to create features for model evaluation or model training. + return_dataset: Default False. Either 'pt' or 'tf'. 
+ if 'pt': returns a torch.data.TensorDataset, + if 'tf': returns a tf.data.Dataset + threads: multiple processing threadsa-smi + + + Returns: + list of :class:`~transformers.data.processors.squad.SquadFeatures` + + Example:: + + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + """ + + # Defining helper methods + features = [] + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + mininterval=5, + disable=hvd.rank() not in [-1, 0] + ) + ) + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm(features, total=len(features), desc="add example index and unique id", + mininterval=5, disable=hvd.rank() not in [-1, 0]): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) + unique_id += 1 + example_index += 1 + features = new_features + del new_features + if return_dataset == "pt": + if not is_torch_available(): + raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) + + if not is_training: + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset( + all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask + ) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset( + all_input_ids, + all_attention_masks, + all_token_type_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + all_is_impossible, + ) + + return features, dataset + elif return_dataset == "tf": + if not is_tf_available(): + raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") + + def gen(): + for ex in features: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + { + "start_position": ex.start_position, + "end_position": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + + return tf.data.Dataset.from_generator( + gen, + ( + {"input_ids": tf.int32, 
"attention_mask": tf.int32, "token_type_ids": tf.int32}, + { + "start_position": tf.int64, + "end_position": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + { + "start_position": tf.TensorShape([]), + "end_position": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ), + ) + + return features + + +class DataProcessor(object): # TODO can be removed + """Base class for data converters for sequence classification data sets.""" + + def get_example_from_tensor_dict(self, tensor_dict): + """Gets an example from a dict with tensorflow tensors + Args: + tensor_dict: Keys and values should match the corresponding Glue + tensorflow_dataset examples. + """ + raise NotImplementedError() + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + def tfds_map(self, example): + """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. + This method converts examples to the correct format.""" + if len(self.get_labels()) > 1: + example.label = self.get_labels()[int(example.label)] + return example + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + return list(csv.reader(f, delimiter="\t", quotechar=quotechar)) + + +class SquadProcessor(DataProcessor): + """ + Processor for the SQuAD data set. + Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. + """ + + train_file = None + dev_file = None + + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + if not evaluate: + answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") + answer_start = tensor_dict["answers"]["answer_start"][0].numpy() + answers = [] + else: + answers = [ + {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} + for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) + ] + + answer = None + answer_start = None + + return SquadExample( + qas_id=tensor_dict["id"].numpy().decode("utf-8"), + question_text=tensor_dict["question"].numpy().decode("utf-8"), + context_text=tensor_dict["context"].numpy().decode("utf-8"), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict["title"].numpy().decode("utf-8"), + answers=answers, + ) + + def get_examples_from_dataset(self, dataset, evaluate=False): + """ + Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset. 
+ + Args: + dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` + evaluate: boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples:: + + import tensorflow_datasets as tfds + dataset = tfds.load("squad") + + training_examples = get_examples_from_dataset(dataset, evaluate=False) + evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + """ + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] + + examples = [] + for tensor_dict in tqdm(dataset, mininterval=5, disable=hvd.rank() not in [-1, 0]): + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + + return examples + + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ + if data_dir is None: + data_dir = "" + + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the evaluation file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. 
+ """ + if data_dir is None: + data_dir = "" + + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def _create_examples(self, input_data, set_type): + is_training = set_type == "train" + examples = [] + for entry in tqdm(input_data, mininterval=5, disable=hvd.rank() not in [-1, 0]): + title = entry["title"] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position_character = None + answer_text = None + answers = [] + + if "is_impossible" in qa: + is_impossible = qa["is_impossible"] + else: + is_impossible = False + + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer["text"] + start_position_character = answer["answer_start"] + else: + answers = qa["answers"] + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position_character=start_position_character, + title=title, + is_impossible=is_impossible, + answers=answers, + ) + + examples.append(example) + return examples + + +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. + """ + + def __init__( + self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + ): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.title = title + self.is_impossible = is_impossible + self.answers = answers + + self.start_position, self.end_position = 0, 0 + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start and end positions only has a value during evaluation. 
+ if start_position_character is not None and not is_impossible: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[ + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) + ] + + +class SquadFeatures(object): + """ + Single squad example features to be fed to a model. + Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample` + using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. + If a token does not have their maximum context in this feature object, it means that another feature object + has more information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. + start_position: start of the answer token index + end_position: end of the answer token index + """ + + def __init__( + self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map, + start_position, + end_position, + is_impossible, + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +class SquadResult(object): + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. + + Args: + unique_id: The unique identifier corresponding to that example. 
+ start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer + """ + + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_logits = start_logits + self.end_logits = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits + + + + + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def get_answers(examples, features, results, args): + predictions = collections.defaultdict(list) # it is possible that one example corresponds to multiple features + _Prediction = collections.namedtuple('_Prediction', ['text', 'start_logit', 'end_logit']) + + if args.version_2_with_negative: + null_vals = collections.defaultdict(lambda: (float("inf"), 0, 0)) + + for ex, feat, result in match_results(examples, features, results): + if not args.joint_head: + start_indices = _get_best_indices(result.start_logits, args.n_best_size) + end_indices = _get_best_indices(result.end_logits, args.n_best_size) + prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args) + feature_null_score = result.start_logits[0] + result.end_logits[0] + + else: + prelim_predictions = get_valid_prelim_predictions_joint_head(result.start_top_index, result.end_top_index, + feat, result, args) + # start_indices = result.start_top_index + # end_indices = result.end_top_index + feature_null_score = result.cls_logits + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + if args.version_2_with_negative and feature_null_score < null_vals[ex.qas_id][0]: + null_vals[ex.qas_id] = (feature_null_score, result.start_logits[0], result.end_logits[0]) + + curr_predictions = [] + seen_predictions = set() + for pred in prelim_predictions: + if len(curr_predictions) == args.n_best_size: + break + if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant + final_text = get_answer_text(ex, feat, pred, args) + else: + final_text = '' + if final_text in seen_predictions: + continue + + seen_predictions.add(final_text) + curr_predictions.append(_Prediction(final_text, pred.start_logit, pred.end_logit)) + predictions[ex.qas_id] += curr_predictions + + # Add empty prediction + if args.version_2_with_negative: + for qas_id in predictions.keys(): + predictions[qas_id].append(_Prediction('', + null_vals[qas_id][1], + null_vals[qas_id][2])) + + nbest_answers = collections.defaultdict(list) + answers = {} + for qas_id, preds in predictions.items(): + # nbest = sorted( + # preds, + # key=lambda x: (x.start_logit + x.end_logit), + # reverse=True)[:args.n_best_size] + seen_predictions = set() + nbest = [] + for pred in sorted(predictions[qas_id], key=lambda x: (x.start_logit + x.end_logit), reverse=True): + if len(nbest) >= args.n_best_size: + break + if pred.text in seen_predictions: + continue + seen_predictions.add(pred.text) + nbest.append(pred) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. 
+ if not nbest or (args.version_2_with_negative and len(nbest) == 1): + nbest.append(_Prediction(text="empty", start_logit=0.0, end_logit=0.0)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry and entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_answers[qas_id].append(output) + + if args.version_2_with_negative: + if not args.joint_head: + score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit + else: + score_diff = null_vals[qas_id][0] + if score_diff > args.null_score_diff_threshold: + answers[qas_id] = "" + else: + answers[qas_id] = best_non_null_entry.text + else: + answers[qas_id] = nbest_answers[qas_id][0]['text'] + + return answers, nbest_answers + + +def get_answer_text(example, feature, pred, args): + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging) + return final_text + + +def get_valid_prelim_predictions_joint_head(start_indices, end_indices, feature, result, args): + _PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + prelim_predictions = [] + # for start_index in start_indices: + + for i in range(args.beam_size): + start_index = start_indices[i] + for j in range(args.beam_size): + # for end_index in end_indices: + end_index = end_indices[i * args.beam_size + j] + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[i], # start_index], + end_logit=result.end_logits[i * args.beam_size + j])) # end_index])) + return prelim_predictions + + +def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args): + _PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + prelim_predictions = [] + for start_index in start_indices: + for end_index in end_indices: + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not 
feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + return prelim_predictions + + +def match_results(examples, features, results): + unique_f_ids = set([f.unique_id for f in features]) + unique_r_ids = set([r.unique_id for r in results]) + matching_ids = unique_f_ids & unique_r_ids + features = [f for f in features if f.unique_id in matching_ids] + results = [r for r in results if r.unique_id in matching_ids] + features.sort(key=lambda x: x.unique_id) + results.sort(key=lambda x: x.unique_id) + + for f, r in zip(features, results): # original code assumes strict ordering of examples. TODO: rewrite this + yield examples[f.example_index], f, r + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
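+  # Hypothetical illustration: for an original text "John  Smith" the stripped
+  # text is "JohnSmith", and orig_ns_to_s_map is {0: 0, 1: 1, 2: 2, 3: 3, 4: 6, ...},
+  # so a position found in the space-free text can be walked back to the matching
+  # character offset in the original text.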
+ tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indices(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indices = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indices.append(index_and_score[i][0]) + return best_indices + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/modelzoo/ELECTRA/tokenization.py b/modelzoo/ELECTRA/tokenization.py new file mode 100644 index 00000000..47421c2d --- /dev/null +++ b/modelzoo/ELECTRA/tokenization.py @@ -0,0 +1,68 @@ +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
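A small self-contained sketch (not taken from the patch; the numbers are made up) of how the SQuAD post-processing helpers above combine: `_get_best_indices` keeps the top-`n_best_size` logit positions, and `_compute_softmax` turns the summed start/end logits of the surviving candidate spans into probabilities:

    import math

    def softmax(scores):
        # same max-subtraction trick as _compute_softmax above
        m = max(scores)
        exps = [math.exp(s - m) for s in scores]
        total = sum(exps)
        return [e / total for e in exps]

    start_logits = [0.1, 2.3, -1.0, 4.2]
    top2 = sorted(range(len(start_logits)), key=lambda i: start_logits[i], reverse=True)[:2]
    # top2 == [3, 1], the positions _get_best_indices would keep for n_best_size=2

    candidate_scores = [7.1, 5.4, -0.3]   # hypothetical start_logit + end_logit sums
    print(softmax(candidate_scores))      # ~[0.85, 0.15, 0.0005]; the best span gets the mass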
+
+from tokenization_utils import BertTokenizer
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt",
+        "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt",
+        "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt",
+        "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt",
+        "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt",
+        "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "google/electra-small-generator": 512,
+    "google/electra-base-generator": 512,
+    "google/electra-large-generator": 512,
+    "google/electra-small-discriminator": 512,
+    "google/electra-base-discriminator": 512,
+    "google/electra-large-discriminator": 512,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "google/electra-small-generator": {"do_lower_case": True},
+    "google/electra-base-generator": {"do_lower_case": True},
+    "google/electra-large-generator": {"do_lower_case": True},
+    "google/electra-small-discriminator": {"do_lower_case": True},
+    "google/electra-base-discriminator": {"do_lower_case": True},
+    "google/electra-large-discriminator": {"do_lower_case": True},
+}
+
+
+class ElectraTokenizer(BertTokenizer):
+    r"""
+    Constructs an Electra tokenizer.
+    :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+
diff --git a/modelzoo/ELECTRA/tokenization_utils.py b/modelzoo/ELECTRA/tokenization_utils.py
new file mode 100644
index 00000000..928532c6
--- /dev/null
+++ b/modelzoo/ELECTRA/tokenization_utils.py
@@ -0,0 +1,2415 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
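A minimal usage sketch for the `ElectraTokenizer` defined in modelzoo/ELECTRA/tokenization.py above (not part of the patch; it assumes the working directory is `modelzoo/ELECTRA` so that `tokenization.py` is importable, and that the shortcut name can be resolved to its `vocab.txt`, or that a local directory containing `vocab.txt` is passed instead):

    from tokenization import ElectraTokenizer

    # Resolves the shortcut name via PRETRAINED_VOCAB_FILES_MAP, caches vocab.txt,
    # and applies the do_lower_case=True default from PRETRAINED_INIT_CONFIGURATION.
    tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

    tokens = tokenizer.tokenize("Deepray fine-tunes ELECTRA on SQuAD.")
    input_ids = tokenizer.encode("Deepray fine-tunes ELECTRA on SQuAD.", add_special_tokens=True)

The same `from_pretrained()` call also accepts a path to a directory previously written by `save_pretrained()`.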
+"""Tokenization classes for OpenAI GPT.""" + +import copy +import functools +import itertools +import json +import logging +import operator +import os +import re +import collections +import unicodedata + +from collections import UserDict, defaultdict +from contextlib import contextmanager +from typing import List, Optional, Sequence, Tuple, Union + +from tokenizers import AddedToken, Encoding +from tokenizers.implementations import BaseTokenizer + +from file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available + + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch + +logger = logging.getLogger(__name__) + +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +# Define type aliases +TextInput = str +TextPairInput = Tuple[str, str] +PreTokenizedInput = List[str] +PreTokenizedInputPair = Tuple[List[str], List[str]] + + +def flatten(x: Sequence): + """ + Flatten the provided (potentially nested) sequence + + Args: + x (Sequence): Potentially nested sequence to flatten + + Returns: + list: Flattened sequence + """ + + return functools.reduce(operator.iconcat, x, []) + + +@contextmanager +def truncate_and_pad( + tokenizer: BaseTokenizer, + max_length: int, + stride: int, + strategy: str, + pad_to_max_length: bool, + padding_side: str, + pad_token_id: int, + pad_token_type_id: int, + pad_token: str, +): + """ + This contextmanager is in charge of defining the truncation and the padding strategies and then + restore the tokenizer settings afterwards. + + This contextmanager assumes the provider tokenizer has no padding / truncation strategy + before the managed section. If your tokenizer set a padding / truncation strategy before, + then it will be reset to no padding/truncation when exiting the managed section. 
+ + Args: + tokenizer (BaseTokenizer): The tokenizer which will be used + max_length (int): The maximum size of the sequence + stride (int): The stride to use when handling overflow + strategy (str): Overflowing logic to use + pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length + padding_side (str): "left" or "right" indicating the direction the output sequence will be padded + pad_token_id (int): The integer representation of the padding token to use + pad_token_type_id (int): The integer representation of the padding token type to use + pad_token (str): The string representation of the padding token to use + + Returns: + + """ + + # Handle all the truncation and padding stuff + if max_length is not None: + tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.enable_padding( + max_length=max_length, + direction=padding_side, + pad_id=pad_token_id, + pad_type_id=pad_token_type_id, + pad_token=pad_token, + ) + elif pad_to_max_length: + logger.warning( + "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n" + "To remove this error, you can add a new pad token and then resize model embedding:\n" + "\ttokenizer.pad_token = ''\n\tmodel.resize_token_embeddings(len(tokenizer))".format( + pad_token, pad_token_id + ) + ) + + yield + + if max_length is not None: + tokenizer.no_truncation() + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.no_padding() + + +class BatchEncoding(UserDict): + """ + Data structure derived from Dictionary holding all the required information to forward through + a model. + + In addition, this structure expose utility methods to map from word/char space to token space. + """ + + def __init__(self, data: dict, encoding: Optional[Union[Encoding, Sequence[Encoding]]] = None): + super().__init__(data) + + if isinstance(encoding, Encoding): + encoding = [encoding] + + self._encodings = encoding + + def __getitem__(self, item: Union[int, str]) -> Encoding: + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError("int index is supported only on {} from a Rust tokenizer".format(type(self).__name__)) + + def __getattr__(self, item: str): + return self.data[item] + + @property + def encodings(self) -> Optional[List[Encoding]]: + """ + Return the list all encoding from the tokenization process + + Returns: List[Encoding] or None if input was tokenized through Python tokenizer + """ + return self._encodings + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + def char_to_token_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the token containing the character at the specified position + + Args: + sentence: Index of the sentence relative to the batch provided to the tokenizer + char: Char index to get the relative token offsets + + Returns: + tuple: (token start, token end) + + """ + + if not self._encodings: + raise ValueError("char_to_token_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_token_offsets(char) + + def char_to_token(self, sentence: int, char: int) -> int: + """ + Return the index of the token at position of the given char. 
+ + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + int: Integer referring to the position of the token in the returned set of tokens for the sentence + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + return self[sentence].char_to_token(char) + + def char_to_word_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the word containing the character at the specified position + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + tuple: (word start, word end) representing the first and last characters of the word + """ + + if not self._encodings: + raise ValueError("char_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_word_offsets(char) + + def token_to_word_offsets(self, sentence: int, index: int) -> Optional[Tuple[int, int]]: + """ + Find the Offsets of the word containing the token at the given index + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + index (int): Index of the token to map to the original word offsets + + Returns: + Optional[tuple]: (word start, word end) or None + """ + + if not self._encodings: + raise ValueError("token_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].token_to_word_offsets(index) + + +class SpecialTokensMixin: + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, **kwargs): + + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + elif isinstance(value, AddedToken): + setattr(self, key, str(value)) + elif isinstance(value, str): + setattr(self, key, value) + else: + raise TypeError( + "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) + ) + + @property + def bos_token(self): + """ Beginning of sentence token (string). Log an error if used while not having been set. """ + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self._bos_token + + @property + def eos_token(self): + """ End of sentence token (string). Log an error if used while not having been set. """ + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self._eos_token + + @property + def unk_token(self): + """ Unknown token (string). Log an error if used while not having been set. """ + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self._unk_token + + @property + def sep_token(self): + """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. 
""" + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self._sep_token + + @property + def pad_token(self): + """ Padding token (string). Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self._pad_token + + @property + def cls_token(self): + """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self._cls_token + + @property + def mask_token(self): + """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self._mask_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self._additional_special_tokens + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self): + """ Id of the padding token type in the vocabulary.""" + return self._pad_token_type_id + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. 
Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class PreTrainedTokenizer(SpecialTokensMixin): + """ Base class for all tokenizers. + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + Class attributes (overridden by derived classes): + + - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). + - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. + - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. + + Parameters: + + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + + - ``unk_token``: (`Optional`) string: an unknown token. 
Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` + """ + + vocab_files_names = {} + pretrained_vocab_files_map = {} + pretrained_init_configuration = {} + max_model_input_sizes = {} + model_input_names = ["token_type_ids", "attention_mask"] + + padding_side = "right" + + NO_PAD_TOKEN_FOR_BATCH_MSG = ( + "No padding token is set for this model, therefore no batch can be made with uneven " + "sequences. Set a padding token or adjust the lengths of the sequences building the " + "batch so that every sequence is of the same length." + ) + + UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( + "The sequences building the batch are not of the same size, no tensor " + "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" + "up to the larger sequence's length." + ) + + @property + def vocab_size(self) -> int: + """ Size of the base vocabulary (without the added tokens) """ + raise NotImplementedError + + @property + def is_fast(self): + return False + + def get_vocab(self): + """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """ + raise NotImplementedError() + + def __init__(self, max_len=None, **kwargs): + + super().__init__(**kwargs) + + self.max_len = max_len if max_len is not None else int(1e12) + + # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + # Added tokens + self.added_tokens_encoder = {} + self.unique_added_tokens_encoder = set() + self.added_tokens_decoder = {} + + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = {} + + def __len__(self): + """ Size of the full vocabulary with the added tokens """ + return self.vocab_size + len(self.added_tokens_encoder) + + @classmethod + def from_pretrained(cls, *inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + + Args: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer + + # Download vocabulary from S3 and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. 
+ # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + ) + ) + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + "Calling {}.from_pretrained() with the path to a single file or url is not supported." + "Use a model identifier or the path to a directory instead.".format(cls.__name__) + ) + logger.warning( + "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( + cls.__name__ + ) + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + } + # Look for the tokenizer main vocabulary files + the additional tokens files + for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): + if os.path.isdir(pretrained_model_name_or_path): + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + else: + full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name) + + vocab_files[file_id] = full_file_name + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + except EnvironmentError: + if pretrained_model_name_or_path in s3_models: + msg = "Couldn't reach server at '{}' to download vocabulary files." + else: + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). 
" + "We assumed '{}' was a path or url to a directory containing vocabulary files " + "named {}, but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + raise EnvironmentError(msg) + + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): + raise EnvironmentError( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " + "named {} but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] + if max_len is not None and isinstance(max_len, (int, float)): + init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key not in init_kwargs: + init_kwargs[key] = value + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # update unique_added_tokens_encoder with special tokens for correct tokenization + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) + + # Add supplementary tokens. 
+ if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + tokenizer.added_tokens_encoder.update(added_tok_encoder) + tokenizer.added_tokens_decoder.update(added_tok_decoder) + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys())) + + return tokenizer + + def save_pretrained(self, save_directory): + """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + if not os.path.isdir(save_directory): + logger.error("Saving directory ({}) should be a directory".format(save_directory)) + return + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) + + if len(self.added_tokens_encoder) > 0: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + def save_vocabulary(self, save_directory): + """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens + and special token mappings. + + Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + raise NotImplementedError + + def add_tokens(self, new_tokens): + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, list): + new_tokens = [new_tokens] + + to_add_tokens = [] + for token in new_tokens: + assert isinstance(token, str) + if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in to_add_tokens + ): + to_add_tokens.append(token) + logger.info("Adding %s to the vocabulary", token) + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) + self.added_tokens_decoder.update(added_tok_decoder) + + return len(to_add_tokens) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. + + Returns: + Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def add_special_tokens(self, special_tokens_dict): + """ + Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them + to class attributes. If special tokens are NOT in the vocabulary, they are added + to it (indexed starting from the last index of the current vocabulary). + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + + Args: + special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: + [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + added_tokens += self.add_tokens(value) + else: + assert isinstance(value, str) + added_tokens += self.add_tokens([value]) + logger.info("Assigning %s to the %s key of the tokenizer", value, key) + setattr(self, key, value) + + return added_tokens + + def tokenize(self, text: TextInput, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Take care of added tokens. + + text: The sequence to be encoded. + add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence + begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`. + **kwargs: passed to the `prepare_for_tokenization` preprocessing method. + """ + all_special_tokens = self.all_special_tokens + text = self.prepare_for_tokenization(text, **kwargs) + + def lowercase_text(t): + # convert non-special tokens to lowercase + escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) + + if self.init_kwargs.get("do_lower_case", False): + text = lowercase_text(text) + + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.rstrip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self._tokenize(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.unique_added_tokens_encoder: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token) if token not in self.unique_added_tokens_encoder else [token] + for token in tokenized_text + ) + ) + ) + + added_tokens = self.unique_added_tokens_encoder + tokenized_text = split_on_tokens(added_tokens, text) + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). 
+ + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): + """ Converts a single token, or a sequence of tokens, (str) in a single integer id + (resp. a sequence of ids), using the vocabulary. + """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def encode( + self, + text: TextInput, + text_pair: Optional[TextInput] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + **kwargs + ): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + **kwargs: passed to the `self.tokenize()` method + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def encode_plus( + self, + text: TextInput, + text_pair: Optional[TextInput] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + is_pretokenized: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
+ truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. 
+ **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. 
In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], List[TextPairInput], List[PreTokenizedInput], List[PreTokenizedInputPair] + ], + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + is_pretokenized: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_masks: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_masks: bool = False, + return_offsets_mapping: bool = False, + return_input_lengths: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`): + Batch of sequences or pair of sequences to be encoded. + This can be a list of string/string-sequences/int-sequences or a list of pair of + string/string-sequences/int-sequence (see details in encode_plus) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. + return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set the resulting dictionary will include the length of each sample + **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[List[int]], + token_type_ids: list[List[int]] if return_token_type_ids is True (default) + attention_mask: list[List[int]] if return_attention_mask is True (default) + overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[List[int]] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. 
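[Editor's note] A minimal usage sketch of the batch API documented above, for readers skimming the patch. This is an illustration only, not part of the diff: it assumes the classes in this file are importable, the vocabulary path points at the vocab.txt added under modelzoo/ELECTRA/vocab/, and the example strings are placeholders; the exact fields returned depend on the tokenizer's `model_input_names`.

    tokenizer = BertTokenizer("modelzoo/ELECTRA/vocab/vocab.txt", do_lower_case=True)
    batch = tokenizer.batch_encode_plus(
        ["hello world", "hi"],       # two sequences of different lengths
        pad_to_max_length=True,      # pad the shorter row up to the longest one in the batch
    )
    # batch["input_ids"]      -> list of int lists, each row is [CLS] ... [SEP] plus [PAD] ids
    # batch["token_type_ids"] -> all zeros for single (unpaired) sequences
    # batch["attention_mask"] -> 1 for real tokens, 0 for padding positions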
+ """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: + ids, pair_ids = ids_or_pair_ids + else: + ids, pair_ids = ids_or_pair_ids, None + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + if max_length is None and pad_to_max_length: + + def total_sequence_length(input_pairs): + first_ids, second_ids = input_pairs + return len(first_ids) + ( + self.num_special_tokens_to_add() + if second_ids is None + else (len(second_ids) + self.num_special_tokens_to_add(pair=True)) + ) + + max_length = max([total_sequence_length(ids) for ids in input_ids]) + + batch_outputs = {} + for first_ids, second_ids in input_ids: + # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by + # the model. 
It adds special tokens, truncates sequences if overflowing while taking into account + # the special tokens and manages a window stride for overflowing tokens + outputs = self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_attention_mask=return_attention_masks, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_masks, + ) + + # Append the non-padded length to the output + if return_input_lengths: + outputs["input_len"] = len(outputs["input_ids"]) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + if return_tensors is not None: + + # Do the tensor conversion in batch + for key, value in batch_outputs.items(): + if return_tensors == "tf" and is_tf_available(): + try: + batch_outputs[key] = tf.constant(value) + except ValueError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + elif return_tensors == "pt" and is_torch_available(): + try: + batch_outputs[key] = torch.tensor(value) + except ValueError: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + except RuntimeError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return BatchEncoding(batch_outputs) + + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + max_length: Optional[int] = None, + add_special_tokens: bool = True, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + ): + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. + It adds special tokens, truncates + sequences if overflowing while taking into account the special tokens and manages a window stride for + overflowing tokens + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential + list of inputs. 
+        truncation_strategy: string selected in the following options:
+            - 'longest_first' (default): iteratively reduce the input sequences until the total length is under
+                max_length, removing one token at a time from the longest sequence (when there is a pair of input sequences)
+            - 'only_first': Only truncate the first sequence
+            - 'only_second': Only truncate the second sequence
+            - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+        pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+            padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+            The tokenizer padding sides are handled by the class attribute `padding_side`, which can be set to the following strings:
+            - 'left': pads on the left of the sequences
+            - 'right': pads on the right of the sequences
+            Defaults to False: no padding.
+        return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+            or PyTorch torch.Tensor instead of a list of python integers.
+        return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
+        return_attention_mask: (optional) Set to False to avoid returning the attention mask (default True).
+        return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
+        return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
+
+        Return:
+            A Dictionary of shape::
+
+                {
+                    input_ids: list[int],
+                    token_type_ids: list[int] if return_token_type_ids is True (default)
+                    overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
+                    num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
+                    special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
+                }
+
+            With the fields:
+                ``input_ids``: list of token ids to be fed to a model
+                ``token_type_ids``: list of token type ids to be fed to a model
+                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+                ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                    tokens and 1 specifying sequence tokens.
+ """ + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + if max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Handle special_tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) + + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + + if max_length and len(encoded_inputs["input_ids"]) > max_length: + encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] + if return_token_type_ids: + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] + + if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len) + ) + + needs_to_be_padded = pad_to_max_length and ( + max_length + and len(encoded_inputs["input_ids"]) < max_length + or max_length is None + and len(encoded_inputs["input_ids"]) < self.max_len + and self.max_len <= 10000 + ) + + if pad_to_max_length and max_length is None and self.max_len > 10000: + logger.warning( + "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." 
+ ) + + if needs_to_be_padded: + difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) + + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + elif return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + # Prepare inputs as tensors if asked + if return_tensors == "tf" and is_tf_available(): + encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) + + elif return_tensors == "pt" and is_torch_available(): + encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]]) + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return BatchEncoding(encoded_inputs) + + def prepare_for_tokenization(self, text, **kwargs): + """ Performs any necessary transformations before tokenization """ + return text + + def truncate_sequences( + self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0 + ): + """Truncates a sequence pair in place to the maximum length. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. 
+ - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == "longest_first": + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == "only_first": + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == "only_second": + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == "do_not_truncate": + raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). 
Default: False
+        """
+        if isinstance(ids, int):
+            if ids in self.added_tokens_decoder:
+                return self.added_tokens_decoder[ids]
+            else:
+                return self._convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            index = int(index)
+            if skip_special_tokens and index in self.all_special_ids:
+                continue
+            if index in self.added_tokens_decoder:
+                tokens.append(self.added_tokens_decoder[index])
+            else:
+                tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+    def _convert_id_to_token(self, index):
+        raise NotImplementedError
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) into a single string.
+            The simplest way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
+            but we often want to remove sub-word tokenization artifacts at the same time.
+        """
+        return " ".join(self.convert_ids_to_tokens(tokens))
+
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """
+        Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
+        with options to remove special tokens and clean up tokenization spaces.
+        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+        Args:
+            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
+            skip_special_tokens: if set to True, will remove special tokens from the decoded string.
+            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        # To avoid mixing byte-level and unicode for byte-level BPE
+        # we need to build the string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        current_sub_text = []
+        for token in filtered_tokens:
+            # compare against the special token strings (ids were already mapped back to tokens above)
+            if skip_special_tokens and token in self.all_special_tokens:
+                continue
+            if token in self.added_tokens_encoder:
+                if current_sub_text:
+                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+                    current_sub_text = []
+                sub_texts.append(token)
+            else:
+                current_sub_text.append(token)
+        if current_sub_text:
+            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+        text = " ".join(sub_texts)
+
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
+        else:
+            return text
+
+    @staticmethod
+    def clean_up_tokenization(out_string):
+        """ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
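[Editor's note] A small editorial sketch, not part of the diff, of how `decode` and `clean_up_tokenization` interact; it assumes a `BertTokenizer` instance named `tok` whose vocabulary contains the words used.

    ids = tok.encode("hello, how are you?")   # [CLS] hello , how are you ? [SEP] as ids
    tok.decode(ids, skip_special_tokens=True)
    # Without cleanup the detokenized text would read "hello , how are you ?";
    # clean_up_tokenization re-attaches the punctuation: "hello, how are you?"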
+ """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" do not", " don't") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + +def trim_batch( + input_ids, pad_token_id, attention_mask=None, +): + """Remove columns that are populated exclusively by pad_token_id""" + keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) + if attention_mask is None: + return input_ids[:, keep_column_mask] + else: + return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) + + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", + "bert-base-dutch-cased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "bert-base-finnish-cased-v1": 512, + "bert-base-finnish-uncased-v1": 512, + "bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "bert-base-finnish-cased-v1": {"do_lower_case": False}, + "bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "bert-base-dutch-cased": {"do_lower_case": False}, +} + + +# Bert Classes +class BertTokenizer(PreTrainedTokenizer): + r""" + Constructs a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`string`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do basic tokenization before WordPiece. + never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): + List of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): + The token used for padding, for example when batching sequences of different lengths. 
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/transformers/issues/328 + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + **kwargs + ): + super().__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. 
""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0's). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, vocab_path): + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 
+ + Args: + vocab_path (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + **do_lower_case**: Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + """ + never_split = self.never_split + (never_split if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
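[Editor's note] To make the greedy longest-match-first behaviour concrete, a toy sketch (editorial, not part of the diff): the four-entry vocabulary is invented for illustration, whereas real BERT vocabularies such as the one added in this patch have ~30k entries.

    vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
    wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
    wp.tokenize("unaffable")      # -> ['un', '##aff', '##able']
    wp.tokenize("unaffordable")   # -> ['[UNK]']  (no sequence of vocab pieces covers the whole word)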
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/modelzoo/ELECTRA/utils.py b/modelzoo/ELECTRA/utils.py new file mode 100644 index 00000000..d3e3cc04 --- /dev/null +++ b/modelzoo/ELECTRA/utils.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json, pickle, sys, unicodedata, six, time, os +import horovod.tensorflow as hvd +import tensorflow as tf +import dllogger + +def get_rank(): + try: + return hvd.rank() + except: + return 0 + + +def get_world_size(): + try: + return hvd.size() + except: + return 1 + + +def is_main_process(): + return get_rank() == 0 + + +def format_step(step): + if isinstance(step, str): + return step + s = "" + if len(step) == 1: + s += "Training Iteration: {} ".format(step[0]) + return s + if len(step) > 0: + s += "Training Epoch: {} ".format(step[0]) + if len(step) > 1: + s += "Training Iteration: {} ".format(step[1]) + return s + + +def load_json(path): + with tf.io.gfile.GFile(path, "r") as f: + return json.load(f) + + +def write_json(o, path): + if "/" in path: + tf.io.gfile.makedirs(path.rsplit("/", 1)[0]) + with tf.io.gfile.GFile(path, "w") as f: + json.dump(o, f) + + +def load_pickle(path): + with tf.io.gfile.GFile(path, "rb") as f: + return pickle.load(f) + + +def write_pickle(o, path): + if "/" in path: + tf.io.gfile.makedirs(path.rsplit("/", 1)[0]) + with tf.io.gfile.GFile(path, "wb") as f: + pickle.dump(o, f, -1) + + +def mkdir(path): + if not tf.io.gfile.exists(path): + tf.io.gfile.makedirs(path) + + +def rmrf(path): + if tf.io.gfile.exists(path): + tf.io.gfile.rmtree(path) + + +def rmkdir(path): + rmrf(path) + mkdir(path) + + +def log(*args, **kwargs): + all_rank = kwargs.pop("all_rank", False) + if not all_rank and not is_main_process(): + return + msg = " ".join(map(str, args)) + sys.stdout.write(msg + "\n") + sys.stdout.flush() + + +def log_config(config): + for key, value in sorted(config.__dict__.items()): + log(key, value) + log() + + +def heading(*args): + log(80 * "=") + log(*args) + log(80 * "=") + + +def nest_dict(d, prefixes, delim="_"): + """Go from {prefix_key: value} to {prefix: {key: value}}.""" + nested = {} + for k, v in d.items(): + for prefix in prefixes: + if k.startswith(prefix + delim): + if prefix not in nested: + nested[prefix] = {} + nested[prefix][k.split(delim, 1)[1]] = v + else: + nested[k] = v + return nested + + +def flatten_dict(d, delim="_"): + """Go from {prefix: {key: value}} to {prefix_key: value}.""" + flattened = {} + for k, v in d.items(): + if isinstance(v, dict): + for k2, v2 in v.items(): + flattened[k + delim + k2] = v2 + else: + flattened[k] = v + return flattened + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def get_readable_time(elapsed): + d, h, m, s = [int(x) for x in time.strftime("%d:%H:%M:%S", time.gmtime(elapsed)).split(':')] + d -= 1 + return '{:2d}h{:2d}m{:2d}s'.format(24*d + h, m, s) + +def setup_logger(args): + os.makedirs(args.log_dir, exist_ok=True) + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger_rank{}.log'.format(get_rank())) + else: + log_path = "{}_rank{}".format(args.json_summary, get_rank()) + + if is_main_process(): + dllogger.init(backends = [dllogger.JSONStreamBackend(verbosity=1, filename=log_path), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) + else: + dllogger.init(backends = [dllogger.JSONStreamBackend(verbosity=1, filename=log_path)]) + + for k,v in vars(args).items(): + dllogger.log(step='PARAMETER', data={k:v}, verbosity=0) + + container_setup_info = { + 'NVIDIA_TENSORFLOW_VERSION': os.environ.get('NVIDIA_TENSORFLOW_VERSION'), + 'TENSORFLOW_VERSION': os.environ.get('TENSORFLOW_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0) + +def postprocess_dllog(args): + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger_rank{}.log') + else: + log_path = str(args.json_summary) + "_rank{}" + logfiles = [open(log_path.format(i), 'r') for i in range(get_world_size())] + + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger.log') + else: + log_path = str(args.json_summary) + + with open(log_path, 'w') as dest_file: + for lines in zip(*[f.readlines() for f in logfiles]): + json_lines = [json.loads(l[5:]) for l in lines] + + assert all(x['type'] == json_lines[0]['type'] for x in json_lines) + if json_lines[0]['type'] != 'LOG': + dest_file.write(lines[0]) + continue + + assert all(x['step'] == json_lines[0]['step'] for x in json_lines) + if json_lines[0]['step'] == 'PARAMETER': + dest_file.write(lines[0]) + else: + d = dict.fromkeys(json_lines[0]['data']) + for k in d.keys(): + vs = [line['data'][k] for line in json_lines] + d[k] = sum(vs)/len(vs) + json_lines[0]['data'] = d + dest_file.write('DLLL ') + dest_file.write(json.dumps(json_lines[0])) + dest_file.write('\n') + + for l in logfiles: + l.close() diff --git a/modelzoo/ELECTRA/vocab/vocab.txt b/modelzoo/ELECTRA/vocab/vocab.txt new file mode 100755 index 00000000..fb140275 --- /dev/null +++ b/modelzoo/ELECTRA/vocab/vocab.txt @@ -0,0 +1,30522 @@ +[PAD] +[unused0] +[unused1] +[unused2] +[unused3] +[unused4] +[unused5] +[unused6] +[unused7] +[unused8] +[unused9] +[unused10] +[unused11] +[unused12] +[unused13] +[unused14] +[unused15] +[unused16] +[unused17] +[unused18] 
+[unused19] +[unused20] +[unused21] +[unused22] +[unused23] +[unused24] +[unused25] +[unused26] +[unused27] +[unused28] +[unused29] +[unused30] +[unused31] +[unused32] +[unused33] +[unused34] +[unused35] +[unused36] +[unused37] +[unused38] +[unused39] +[unused40] +[unused41] +[unused42] +[unused43] +[unused44] +[unused45] +[unused46] +[unused47] +[unused48] +[unused49] +[unused50] +[unused51] +[unused52] +[unused53] +[unused54] +[unused55] +[unused56] +[unused57] +[unused58] +[unused59] +[unused60] +[unused61] +[unused62] +[unused63] +[unused64] +[unused65] +[unused66] +[unused67] +[unused68] +[unused69] +[unused70] +[unused71] +[unused72] +[unused73] +[unused74] +[unused75] +[unused76] +[unused77] +[unused78] +[unused79] +[unused80] +[unused81] +[unused82] +[unused83] +[unused84] +[unused85] +[unused86] +[unused87] +[unused88] +[unused89] +[unused90] +[unused91] +[unused92] +[unused93] +[unused94] +[unused95] +[unused96] +[unused97] +[unused98] +[UNK] +[CLS] +[SEP] +[MASK] +[unused99] +[unused100] +[unused101] +[unused102] +[unused103] +[unused104] +[unused105] +[unused106] +[unused107] +[unused108] +[unused109] +[unused110] +[unused111] +[unused112] +[unused113] +[unused114] +[unused115] +[unused116] +[unused117] +[unused118] +[unused119] +[unused120] +[unused121] +[unused122] +[unused123] +[unused124] +[unused125] +[unused126] +[unused127] +[unused128] +[unused129] +[unused130] +[unused131] +[unused132] +[unused133] +[unused134] +[unused135] +[unused136] +[unused137] +[unused138] +[unused139] +[unused140] +[unused141] +[unused142] +[unused143] +[unused144] +[unused145] +[unused146] +[unused147] +[unused148] +[unused149] +[unused150] +[unused151] +[unused152] +[unused153] +[unused154] +[unused155] +[unused156] +[unused157] +[unused158] +[unused159] +[unused160] +[unused161] +[unused162] +[unused163] +[unused164] +[unused165] +[unused166] +[unused167] +[unused168] +[unused169] +[unused170] +[unused171] +[unused172] +[unused173] +[unused174] +[unused175] +[unused176] +[unused177] +[unused178] +[unused179] +[unused180] +[unused181] +[unused182] +[unused183] +[unused184] +[unused185] +[unused186] +[unused187] +[unused188] +[unused189] +[unused190] +[unused191] +[unused192] +[unused193] +[unused194] +[unused195] +[unused196] +[unused197] +[unused198] +[unused199] +[unused200] +[unused201] +[unused202] +[unused203] +[unused204] +[unused205] +[unused206] +[unused207] +[unused208] +[unused209] +[unused210] +[unused211] +[unused212] +[unused213] +[unused214] +[unused215] +[unused216] +[unused217] +[unused218] +[unused219] +[unused220] +[unused221] +[unused222] +[unused223] +[unused224] +[unused225] +[unused226] +[unused227] +[unused228] +[unused229] +[unused230] +[unused231] +[unused232] +[unused233] +[unused234] +[unused235] +[unused236] +[unused237] +[unused238] +[unused239] +[unused240] +[unused241] +[unused242] +[unused243] +[unused244] +[unused245] +[unused246] +[unused247] +[unused248] +[unused249] +[unused250] +[unused251] +[unused252] +[unused253] +[unused254] +[unused255] +[unused256] +[unused257] +[unused258] +[unused259] +[unused260] +[unused261] +[unused262] +[unused263] +[unused264] +[unused265] +[unused266] +[unused267] +[unused268] +[unused269] +[unused270] +[unused271] +[unused272] +[unused273] +[unused274] +[unused275] +[unused276] +[unused277] +[unused278] +[unused279] +[unused280] +[unused281] +[unused282] +[unused283] +[unused284] +[unused285] +[unused286] +[unused287] +[unused288] +[unused289] +[unused290] +[unused291] +[unused292] +[unused293] +[unused294] +[unused295] 
+[unused296] +[unused297] +[unused298] +[unused299] +[unused300] +[unused301] +[unused302] +[unused303] +[unused304] +[unused305] +[unused306] +[unused307] +[unused308] +[unused309] +[unused310] +[unused311] +[unused312] +[unused313] +[unused314] +[unused315] +[unused316] +[unused317] +[unused318] +[unused319] +[unused320] +[unused321] +[unused322] +[unused323] +[unused324] +[unused325] +[unused326] +[unused327] +[unused328] +[unused329] +[unused330] +[unused331] +[unused332] +[unused333] +[unused334] +[unused335] +[unused336] +[unused337] +[unused338] +[unused339] +[unused340] +[unused341] +[unused342] +[unused343] +[unused344] +[unused345] +[unused346] +[unused347] +[unused348] +[unused349] +[unused350] +[unused351] +[unused352] +[unused353] +[unused354] +[unused355] +[unused356] +[unused357] +[unused358] +[unused359] +[unused360] +[unused361] +[unused362] +[unused363] +[unused364] +[unused365] +[unused366] +[unused367] +[unused368] +[unused369] +[unused370] +[unused371] +[unused372] +[unused373] +[unused374] +[unused375] +[unused376] +[unused377] +[unused378] +[unused379] +[unused380] +[unused381] +[unused382] +[unused383] +[unused384] +[unused385] +[unused386] +[unused387] +[unused388] +[unused389] +[unused390] +[unused391] +[unused392] +[unused393] +[unused394] +[unused395] +[unused396] +[unused397] +[unused398] +[unused399] +[unused400] +[unused401] +[unused402] +[unused403] +[unused404] +[unused405] +[unused406] +[unused407] +[unused408] +[unused409] +[unused410] +[unused411] +[unused412] +[unused413] +[unused414] +[unused415] +[unused416] +[unused417] +[unused418] +[unused419] +[unused420] +[unused421] +[unused422] +[unused423] +[unused424] +[unused425] +[unused426] +[unused427] +[unused428] +[unused429] +[unused430] +[unused431] +[unused432] +[unused433] +[unused434] +[unused435] +[unused436] +[unused437] +[unused438] +[unused439] +[unused440] +[unused441] +[unused442] +[unused443] +[unused444] +[unused445] +[unused446] +[unused447] +[unused448] +[unused449] +[unused450] +[unused451] +[unused452] +[unused453] +[unused454] +[unused455] +[unused456] +[unused457] +[unused458] +[unused459] +[unused460] +[unused461] +[unused462] +[unused463] +[unused464] +[unused465] +[unused466] +[unused467] +[unused468] +[unused469] +[unused470] +[unused471] +[unused472] +[unused473] +[unused474] +[unused475] +[unused476] +[unused477] +[unused478] +[unused479] +[unused480] +[unused481] +[unused482] +[unused483] +[unused484] +[unused485] +[unused486] +[unused487] +[unused488] +[unused489] +[unused490] +[unused491] +[unused492] +[unused493] +[unused494] +[unused495] +[unused496] +[unused497] +[unused498] +[unused499] +[unused500] +[unused501] +[unused502] +[unused503] +[unused504] +[unused505] +[unused506] +[unused507] +[unused508] +[unused509] +[unused510] +[unused511] +[unused512] +[unused513] +[unused514] +[unused515] +[unused516] +[unused517] +[unused518] +[unused519] +[unused520] +[unused521] +[unused522] +[unused523] +[unused524] +[unused525] +[unused526] +[unused527] +[unused528] +[unused529] +[unused530] +[unused531] +[unused532] +[unused533] +[unused534] +[unused535] +[unused536] +[unused537] +[unused538] +[unused539] +[unused540] +[unused541] +[unused542] +[unused543] +[unused544] +[unused545] +[unused546] +[unused547] +[unused548] +[unused549] +[unused550] +[unused551] +[unused552] +[unused553] +[unused554] +[unused555] +[unused556] +[unused557] +[unused558] +[unused559] +[unused560] +[unused561] +[unused562] +[unused563] +[unused564] +[unused565] +[unused566] +[unused567] +[unused568] 
+[unused569] +[unused570] +[unused571] +[unused572] +[unused573] +[unused574] +[unused575] +[unused576] +[unused577] +[unused578] +[unused579] +[unused580] +[unused581] +[unused582] +[unused583] +[unused584] +[unused585] +[unused586] +[unused587] +[unused588] +[unused589] +[unused590] +[unused591] +[unused592] +[unused593] +[unused594] +[unused595] +[unused596] +[unused597] +[unused598] +[unused599] +[unused600] +[unused601] +[unused602] +[unused603] +[unused604] +[unused605] +[unused606] +[unused607] +[unused608] +[unused609] +[unused610] +[unused611] +[unused612] +[unused613] +[unused614] +[unused615] +[unused616] +[unused617] +[unused618] +[unused619] +[unused620] +[unused621] +[unused622] +[unused623] +[unused624] +[unused625] +[unused626] +[unused627] +[unused628] +[unused629] +[unused630] +[unused631] +[unused632] +[unused633] +[unused634] +[unused635] +[unused636] +[unused637] +[unused638] +[unused639] +[unused640] +[unused641] +[unused642] +[unused643] +[unused644] +[unused645] +[unused646] +[unused647] +[unused648] +[unused649] +[unused650] +[unused651] +[unused652] +[unused653] +[unused654] +[unused655] +[unused656] +[unused657] +[unused658] +[unused659] +[unused660] +[unused661] +[unused662] +[unused663] +[unused664] +[unused665] +[unused666] +[unused667] +[unused668] +[unused669] +[unused670] +[unused671] +[unused672] +[unused673] +[unused674] +[unused675] +[unused676] +[unused677] +[unused678] +[unused679] +[unused680] +[unused681] +[unused682] +[unused683] +[unused684] +[unused685] +[unused686] +[unused687] +[unused688] +[unused689] +[unused690] +[unused691] +[unused692] +[unused693] +[unused694] +[unused695] +[unused696] +[unused697] +[unused698] +[unused699] +[unused700] +[unused701] +[unused702] +[unused703] +[unused704] +[unused705] +[unused706] +[unused707] +[unused708] +[unused709] +[unused710] +[unused711] +[unused712] +[unused713] +[unused714] +[unused715] +[unused716] +[unused717] +[unused718] +[unused719] +[unused720] +[unused721] +[unused722] +[unused723] +[unused724] +[unused725] +[unused726] +[unused727] +[unused728] +[unused729] +[unused730] +[unused731] +[unused732] +[unused733] +[unused734] +[unused735] +[unused736] +[unused737] +[unused738] +[unused739] +[unused740] +[unused741] +[unused742] +[unused743] +[unused744] +[unused745] +[unused746] +[unused747] +[unused748] +[unused749] +[unused750] +[unused751] +[unused752] +[unused753] +[unused754] +[unused755] +[unused756] +[unused757] +[unused758] +[unused759] +[unused760] +[unused761] +[unused762] +[unused763] +[unused764] +[unused765] +[unused766] +[unused767] +[unused768] +[unused769] +[unused770] +[unused771] +[unused772] +[unused773] +[unused774] +[unused775] +[unused776] +[unused777] +[unused778] +[unused779] +[unused780] +[unused781] +[unused782] +[unused783] +[unused784] +[unused785] +[unused786] +[unused787] +[unused788] +[unused789] +[unused790] +[unused791] +[unused792] +[unused793] +[unused794] +[unused795] +[unused796] +[unused797] +[unused798] +[unused799] +[unused800] +[unused801] +[unused802] +[unused803] +[unused804] +[unused805] +[unused806] +[unused807] +[unused808] +[unused809] +[unused810] +[unused811] +[unused812] +[unused813] +[unused814] +[unused815] +[unused816] +[unused817] +[unused818] +[unused819] +[unused820] +[unused821] +[unused822] +[unused823] +[unused824] +[unused825] +[unused826] +[unused827] +[unused828] +[unused829] +[unused830] +[unused831] +[unused832] +[unused833] +[unused834] +[unused835] +[unused836] +[unused837] +[unused838] +[unused839] +[unused840] +[unused841] 
+[unused842] +[unused843] +[unused844] +[unused845] +[unused846] +[unused847] +[unused848] +[unused849] +[unused850] +[unused851] +[unused852] +[unused853] +[unused854] +[unused855] +[unused856] +[unused857] +[unused858] +[unused859] +[unused860] +[unused861] +[unused862] +[unused863] +[unused864] +[unused865] +[unused866] +[unused867] +[unused868] +[unused869] +[unused870] +[unused871] +[unused872] +[unused873] +[unused874] +[unused875] +[unused876] +[unused877] +[unused878] +[unused879] +[unused880] +[unused881] +[unused882] +[unused883] +[unused884] +[unused885] +[unused886] +[unused887] +[unused888] +[unused889] +[unused890] +[unused891] +[unused892] +[unused893] +[unused894] +[unused895] +[unused896] +[unused897] +[unused898] +[unused899] +[unused900] +[unused901] +[unused902] +[unused903] +[unused904] +[unused905] +[unused906] +[unused907] +[unused908] +[unused909] +[unused910] +[unused911] +[unused912] +[unused913] +[unused914] +[unused915] +[unused916] +[unused917] +[unused918] +[unused919] +[unused920] +[unused921] +[unused922] +[unused923] +[unused924] +[unused925] +[unused926] +[unused927] +[unused928] +[unused929] +[unused930] +[unused931] +[unused932] +[unused933] +[unused934] +[unused935] +[unused936] +[unused937] +[unused938] +[unused939] +[unused940] +[unused941] +[unused942] +[unused943] +[unused944] +[unused945] +[unused946] +[unused947] +[unused948] +[unused949] +[unused950] +[unused951] +[unused952] +[unused953] +[unused954] +[unused955] +[unused956] +[unused957] +[unused958] +[unused959] +[unused960] +[unused961] +[unused962] +[unused963] +[unused964] +[unused965] +[unused966] +[unused967] +[unused968] +[unused969] +[unused970] +[unused971] +[unused972] +[unused973] +[unused974] +[unused975] +[unused976] +[unused977] +[unused978] +[unused979] +[unused980] +[unused981] +[unused982] +[unused983] +[unused984] +[unused985] +[unused986] +[unused987] +[unused988] +[unused989] +[unused990] +[unused991] +[unused992] +[unused993] +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¡ +¢ +£ +¤ +¥ +¦ +§ +¨ +© +ª +« +¬ +® +° +± +² +³ +´ +µ +¶ +· +¹ +º +» +¼ +½ +¾ +¿ +× +ß +æ +ð +÷ +ø +þ +đ +ħ +ı +ł +ŋ +œ +ƒ +ɐ +ɑ +ɒ +ɔ +ɕ +ə +ɛ +ɡ +ɣ +ɨ +ɪ +ɫ +ɬ +ɯ +ɲ +ɴ +ɹ +ɾ +ʀ +ʁ +ʂ +ʃ +ʉ +ʊ +ʋ +ʌ +ʎ +ʐ +ʑ +ʒ +ʔ +ʰ +ʲ +ʳ +ʷ +ʸ +ʻ +ʼ +ʾ +ʿ +ˈ +ː +ˡ +ˢ +ˣ +ˤ +α +β +γ +δ +ε +ζ +η +θ +ι +κ +λ +μ +ν +ξ +ο +π +ρ +ς +σ +τ +υ +φ +χ +ψ +ω +а +б +в +г +д +е +ж +з +и +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ђ +є +і +ј +љ +њ +ћ +ӏ +ա +բ +գ +դ +ե +թ +ի +լ +կ +հ +մ +յ +ն +ո +պ +ս +վ +տ +ր +ւ +ք +־ +א +ב +ג +ד +ה +ו +ז +ח +ט +י +ך +כ +ל +ם +מ +ן +נ +ס +ע +ף +פ +ץ +צ +ק +ר +ש +ת +، +ء +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ـ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ٹ +پ +چ +ک +گ +ں +ھ +ہ +ی +ے +अ +आ +उ +ए +क +ख +ग +च +ज +ट +ड +ण +त +थ +द +ध +न +प +ब +भ +म +य +र +ल +व +श +ष +स +ह +ा +ि +ी +ो +। +॥ +ং +অ +আ +ই +উ +এ +ও +ক +খ +গ +চ +ছ +জ +ট +ড +ণ +ত +থ +দ +ধ +ন +প +ব +ভ +ম +য +র +ল +শ +ষ +স +হ +া +ি +ী +ে +க +ச +ட +த +ந +ன +ப +ம +ய +ர +ல +ள +வ +ா +ி +ு +ே +ை +ನ +ರ +ಾ +ක +ය +ර +ල +ව +ා +ก +ง +ต +ท +น +พ +ม +ย +ร +ล +ว +ส +อ +า +เ +་ +། +ག +ང +ད +ན +པ +བ +མ +འ +ར +ལ +ས +မ +ა +ბ +გ +დ +ე +ვ +თ +ი +კ +ლ +მ +ნ +ო +რ +ს +ტ +უ +ᄀ +ᄂ +ᄃ +ᄅ +ᄆ +ᄇ +ᄉ +ᄊ +ᄋ +ᄌ +ᄎ +ᄏ +ᄐ +ᄑ +ᄒ +ᅡ +ᅢ +ᅥ +ᅦ +ᅧ +ᅩ +ᅪ +ᅭ +ᅮ +ᅯ +ᅲ +ᅳ +ᅴ +ᅵ +ᆨ +ᆫ +ᆯ +ᆷ +ᆸ +ᆼ +ᴬ +ᴮ +ᴰ +ᴵ +ᴺ +ᵀ +ᵃ +ᵇ +ᵈ +ᵉ +ᵍ +ᵏ +ᵐ +ᵒ +ᵖ +ᵗ +ᵘ +ᵢ +ᵣ +ᵤ +ᵥ +ᶜ +ᶠ +‐ +‑ +‒ +– +— +― +‖ +‘ +’ +‚ +“ +” +„ +† +‡ +• +… +‰ +′ +″ +› +‿ +⁄ +⁰ +ⁱ +⁴ +⁵ +⁶ +⁷ +⁸ +⁹ +⁺ +⁻ +ⁿ +₀ +₁ +₂ +₃ +₄ +₅ +₆ +₇ +₈ +₉ +₊ +₍ +₎ +ₐ +ₑ +ₒ +ₓ +ₕ +ₖ +ₗ +ₘ +ₙ +ₚ +ₛ +ₜ +₤ +₩ +€ +₱ +₹ +ℓ +№ +ℝ +™ +⅓ +⅔ +← +↑ +→ +↓ +↔ +↦ +⇄ +⇌ +⇒ +∂ +∅ +∆ +∇ +∈ +− +∗ +∘ +√ +∞ +∧ +∨ +∩ +∪ +≈ +≡ +≤ +≥ +⊂ +⊆ +⊕ +⊗ +⋅ +─ +│ +■ +▪ +● +★ +☆ +☉ +♠ +♣ +♥ +♦ +♭ +♯ +⟨ +⟩ +ⱼ +⺩ +⺼ +⽥ +、 +。 +〈 +〉 +《 +》 +「 +」 +『 +』 +〜 +あ +い +う +え +お +か +き +く +け +こ +さ +し +す +せ +そ +た +ち +っ +つ +て +と +な +に +ぬ +ね +の +は +ひ +ふ +へ +ほ +ま +み +む +め +も +や +ゆ +よ +ら +り +る +れ +ろ +を +ん +ァ +ア +ィ +イ +ウ +ェ +エ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +タ +チ +ッ +ツ +テ +ト +ナ +ニ +ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ャ +ュ +ョ +ラ +リ +ル +レ +ロ +ワ +ン +・ +ー +一 +三 +上 +下 +不 +世 +中 +主 +久 +之 +也 +事 +二 +五 +井 +京 +人 +亻 +仁 +介 +代 +仮 +伊 +会 +佐 +侍 +保 +信 +健 +元 +光 +八 +公 +内 +出 +分 +前 +劉 +力 +加 +勝 +北 +区 +十 +千 +南 +博 +原 +口 +古 +史 +司 +合 +吉 +同 +名 +和 +囗 +四 +国 +國 +土 +地 +坂 +城 +堂 +場 +士 +夏 +外 +大 +天 +太 +夫 +奈 +女 +子 +学 +宀 +宇 +安 +宗 +定 +宣 +宮 +家 +宿 +寺 +將 +小 +尚 +山 +岡 +島 +崎 +川 +州 +巿 +帝 +平 +年 +幸 +广 +弘 +張 +彳 +後 +御 +德 +心 +忄 +志 +忠 +愛 +成 +我 +戦 +戸 +手 +扌 +政 +文 +新 +方 +日 +明 +星 +春 +昭 +智 +曲 +書 +月 +有 +朝 +木 +本 +李 +村 +東 +松 +林 +森 +楊 +樹 +橋 +歌 +止 +正 +武 +比 +氏 +民 +水 +氵 +氷 +永 +江 +沢 +河 +治 +法 +海 +清 +漢 +瀬 +火 +版 +犬 +王 +生 +田 +男 +疒 +発 +白 +的 +皇 +目 +相 +省 +真 +石 +示 +社 +神 +福 +禾 +秀 +秋 +空 +立 +章 +竹 +糹 +美 +義 +耳 +良 +艹 +花 +英 +華 +葉 +藤 +行 +街 +西 +見 +訁 +語 +谷 +貝 +貴 +車 +軍 +辶 +道 +郎 +郡 +部 +都 +里 +野 +金 +鈴 +镇 +長 +門 +間 +阝 +阿 +陳 +陽 +雄 +青 +面 +風 +食 +香 +馬 +高 +龍 +龸 +fi +fl +! +( +) +, +- +. +/ +: +? 
+~ +the +of +and +in +to +was +he +is +as +for +on +with +that +it +his +by +at +from +her +##s +she +you +had +an +were +but +be +this +are +not +my +they +one +which +or +have +him +me +first +all +also +their +has +up +who +out +been +when +after +there +into +new +two +its +##a +time +would +no +what +about +said +we +over +then +other +so +more +##e +can +if +like +back +them +only +some +could +##i +where +just +##ing +during +before +##n +do +##o +made +school +through +than +now +years +most +world +may +between +down +well +three +##d +year +while +will +##ed +##r +##y +later +##t +city +under +around +did +such +being +used +state +people +part +know +against +your +many +second +university +both +national +##er +these +don +known +off +way +until +re +how +even +get +head +... +didn +##ly +team +american +because +de +##l +born +united +film +since +still +long +work +south +us +became +any +high +again +day +family +see +right +man +eyes +house +season +war +states +including +took +life +north +same +each +called +name +much +place +however +go +four +group +another +found +won +area +here +going +10 +away +series +left +home +music +best +make +hand +number +company +several +never +last +john +000 +very +album +take +end +good +too +following +released +game +played +little +began +district +##m +old +want +those +side +held +own +early +county +ll +league +use +west +##u +face +think +##es +2010 +government +##h +march +came +small +general +town +june +##on +line +based +something +##k +september +thought +looked +along +international +2011 +air +july +club +went +january +october +our +august +april +york +12 +few +2012 +2008 +east +show +member +college +2009 +father +public +##us +come +men +five +set +station +church +##c +next +former +november +room +party +located +december +2013 +age +got +2007 +##g +system +let +love +2006 +though +every +2014 +look +song +water +century +without +body +black +night +within +great +women +single +ve +building +large +population +river +named +band +white +started +##an +once +15 +20 +should +18 +2015 +service +top +built +british +open +death +king +moved +local +times +children +february +book +why +11 +door +need +president +order +final +road +wasn +although +due +major +died +village +third +knew +2016 +asked +turned +st +wanted +say +##p +together +received +main +son +served +different +##en +behind +himself +felt +members +power +football +law +voice +play +##in +near +park +history +30 +having +2005 +16 +##man +saw +mother +##al +army +point +front +help +english +street +art +late +hands +games +award +##ia +young +14 +put +published +country +division +across +told +13 +often +ever +french +london +center +six +red +2017 +led +days +include +light +25 +find +tell +among +species +really +according +central +half +2004 +form +original +gave +office +making +enough +lost +full +opened +must +included +live +given +german +player +run +business +woman +community +cup +might +million +land +2000 +court +development +17 +short +round +ii +km +seen +class +story +always +become +sure +research +almost +director +council +la +##2 +career +things +using +island +##z +couldn +car +##is +24 +close +force +##1 +better +free +support +control +field +students +2003 +education +married +##b +nothing +worked +others +record +big +inside +level +anything +continued +give +james +##3 +military +established +non +returned +feel +does +title +written +thing +feet +william +far +co +association +hard +already +2002 +##ra +championship 
+human +western +100 +##na +department +hall +role +various +production +21 +19 +heart +2001 +living +fire +version +##ers +##f +television +royal +##4 +produced +working +act +case +society +region +present +radio +period +looking +least +total +keep +england +wife +program +per +brother +mind +special +22 +##le +am +works +soon +##6 +political +george +services +taken +created +##7 +further +able +reached +david +union +joined +upon +done +important +social +information +either +##ic +##x +appeared +position +ground +lead +rock +dark +election +23 +board +france +hair +course +arms +site +police +girl +instead +real +sound +##v +words +moment +##te +someone +##8 +summer +project +announced +san +less +wrote +past +followed +##5 +blue +founded +al +finally +india +taking +records +america +##ne +1999 +design +considered +northern +god +stop +battle +toward +european +outside +described +track +today +playing +language +28 +call +26 +heard +professional +low +australia +miles +california +win +yet +green +##ie +trying +blood +##ton +southern +science +maybe +everything +match +square +27 +mouth +video +race +recorded +leave +above +##9 +daughter +points +space +1998 +museum +change +middle +common +##0 +move +tv +post +##ta +lake +seven +tried +elected +closed +ten +paul +minister +##th +months +start +chief +return +canada +person +sea +release +similar +modern +brought +rest +hit +formed +mr +##la +1997 +floor +event +doing +thomas +1996 +robert +care +killed +training +star +week +needed +turn +finished +railway +rather +news +health +sent +example +ran +term +michael +coming +currently +yes +forces +despite +gold +areas +50 +stage +fact +29 +dead +says +popular +2018 +originally +germany +probably +developed +result +pulled +friend +stood +money +running +mi +signed +word +songs +child +eventually +met +tour +average +teams +minutes +festival +current +deep +kind +1995 +decided +usually +eastern +seemed +##ness +episode +bed +added +table +indian +private +charles +route +available +idea +throughout +centre +addition +appointed +style +1994 +books +eight +construction +press +mean +wall +friends +remained +schools +study +##ch +##um +institute +oh +chinese +sometimes +events +possible +1992 +australian +type +brown +forward +talk +process +food +debut +seat +performance +committee +features +character +arts +herself +else +lot +strong +russian +range +hours +peter +arm +##da +morning +dr +sold +##ry +quickly +directed +1993 +guitar +china +##w +31 +list +##ma +performed +media +uk +players +smile +##rs +myself +40 +placed +coach +province +towards +wouldn +leading +whole +boy +official +designed +grand +census +##el +europe +attack +japanese +henry +1991 +##re +##os +cross +getting +alone +action +lower +network +wide +washington +japan +1990 +hospital +believe +changed +sister +##ar +hold +gone +sir +hadn +ship +##ka +studies +academy +shot +rights +below +base +bad +involved +kept +largest +##ist +bank +future +especially +beginning +mark +movement +section +female +magazine +plan +professor +lord +longer +##ian +sat +walked +hill +actually +civil +energy +model +families +size +thus +aircraft +completed +includes +data +captain +##or +fight +vocals +featured +richard +bridge +fourth +1989 +officer +stone +hear +##ism +means +medical +groups +management +self +lips +competition +entire +lived +technology +leaving +federal +tournament +bit +passed +hot +independent +awards +kingdom +mary +spent +fine +doesn +reported +##ling +jack +fall +raised +itself +stay +true +studio +1988 
+sports +replaced +paris +systems +saint +leader +theatre +whose +market +capital +parents +spanish +canadian +earth +##ity +cut +degree +writing +bay +christian +awarded +natural +higher +bill +##as +coast +provided +previous +senior +ft +valley +organization +stopped +onto +countries +parts +conference +queen +security +interest +saying +allowed +master +earlier +phone +matter +smith +winning +try +happened +moving +campaign +los +##ley +breath +nearly +mid +1987 +certain +girls +date +italian +african +standing +fell +artist +##ted +shows +deal +mine +industry +1986 +##ng +everyone +republic +provide +collection +library +student +##ville +primary +owned +older +via +heavy +1st +makes +##able +attention +anyone +africa +##ri +stated +length +ended +fingers +command +staff +skin +foreign +opening +governor +okay +medal +kill +sun +cover +job +1985 +introduced +chest +hell +feeling +##ies +success +meet +reason +standard +meeting +novel +1984 +trade +source +buildings +##land +rose +guy +goal +##ur +chapter +native +husband +previously +unit +limited +entered +weeks +producer +operations +mountain +takes +covered +forced +related +roman +complete +successful +key +texas +cold +##ya +channel +1980 +traditional +films +dance +clear +approximately +500 +nine +van +prince +question +active +tracks +ireland +regional +silver +author +personal +sense +operation +##ine +economic +1983 +holding +twenty +isbn +additional +speed +hour +edition +regular +historic +places +whom +shook +movie +km² +secretary +prior +report +chicago +read +foundation +view +engine +scored +1982 +units +ask +airport +property +ready +immediately +lady +month +listed +contract +##de +manager +themselves +lines +##ki +navy +writer +meant +##ts +runs +##ro +practice +championships +singer +glass +commission +required +forest +starting +culture +generally +giving +access +attended +test +couple +stand +catholic +martin +caught +executive +##less +eye +##ey +thinking +chair +quite +shoulder +1979 +hope +decision +plays +defeated +municipality +whether +structure +offered +slowly +pain +ice +direction +##ion +paper +mission +1981 +mostly +200 +noted +individual +managed +nature +lives +plant +##ha +helped +except +studied +computer +figure +relationship +issue +significant +loss +die +smiled +gun +ago +highest +1972 +##am +male +bring +goals +mexico +problem +distance +commercial +completely +location +annual +famous +drive +1976 +neck +1978 +surface +caused +italy +understand +greek +highway +wrong +hotel +comes +appearance +joseph +double +issues +musical +companies +castle +income +review +assembly +bass +initially +parliament +artists +experience +1974 +particular +walk +foot +engineering +talking +window +dropped +##ter +miss +baby +boys +break +1975 +stars +edge +remember +policy +carried +train +stadium +bar +sex +angeles +evidence +##ge +becoming +assistant +soviet +1977 +upper +step +wing +1970 +youth +financial +reach +##ll +actor +numerous +##se +##st +nodded +arrived +##ation +minute +##nt +believed +sorry +complex +beautiful +victory +associated +temple +1968 +1973 +chance +perhaps +metal +##son +1945 +bishop +##et +lee +launched +particularly +tree +le +retired +subject +prize +contains +yeah +theory +empire +##ce +suddenly +waiting +trust +recording +##to +happy +terms +camp +champion +1971 +religious +pass +zealand +names +2nd +port +ancient +tom +corner +represented +watch +legal +anti +justice +cause +watched +brothers +45 +material +changes +simply +response +louis +fast +##ting +answer +60 +historical 
+1969 +stories +straight +create +feature +increased +rate +administration +virginia +el +activities +cultural +overall +winner +programs +basketball +legs +guard +beyond +cast +doctor +mm +flight +results +remains +cost +effect +winter +##ble +larger +islands +problems +chairman +grew +commander +isn +1967 +pay +failed +selected +hurt +fort +box +regiment +majority +journal +35 +edward +plans +##ke +##ni +shown +pretty +irish +characters +directly +scene +likely +operated +allow +spring +##j +junior +matches +looks +mike +houses +fellow +##tion +beach +marriage +##ham +##ive +rules +oil +65 +florida +expected +nearby +congress +sam +peace +recent +iii +wait +subsequently +cell +##do +variety +serving +agreed +please +poor +joe +pacific +attempt +wood +democratic +piece +prime +##ca +rural +mile +touch +appears +township +1964 +1966 +soldiers +##men +##ized +1965 +pennsylvania +closer +fighting +claimed +score +jones +physical +editor +##ous +filled +genus +specific +sitting +super +mom +##va +therefore +supported +status +fear +cases +store +meaning +wales +minor +spain +tower +focus +vice +frank +follow +parish +separate +golden +horse +fifth +remaining +branch +32 +presented +stared +##id +uses +secret +forms +##co +baseball +exactly +##ck +choice +note +discovered +travel +composed +truth +russia +ball +color +kiss +dad +wind +continue +ring +referred +numbers +digital +greater +##ns +metres +slightly +direct +increase +1960 +responsible +crew +rule +trees +troops +##no +broke +goes +individuals +hundred +weight +creek +sleep +memory +defense +provides +ordered +code +value +jewish +windows +1944 +safe +judge +whatever +corps +realized +growing +pre +##ga +cities +alexander +gaze +lies +spread +scott +letter +showed +situation +mayor +transport +watching +workers +extended +##li +expression +normal +##ment +chart +multiple +border +##ba +host +##ner +daily +mrs +walls +piano +##ko +heat +cannot +##ate +earned +products +drama +era +authority +seasons +join +grade +##io +sign +difficult +machine +1963 +territory +mainly +##wood +stations +squadron +1962 +stepped +iron +19th +##led +serve +appear +sky +speak +broken +charge +knowledge +kilometres +removed +ships +article +campus +simple +##ty +pushed +britain +##ve +leaves +recently +cd +soft +boston +latter +easy +acquired +poland +##sa +quality +officers +presence +planned +nations +mass +broadcast +jean +share +image +influence +wild +offer +emperor +electric +reading +headed +ability +promoted +yellow +ministry +1942 +throat +smaller +politician +##by +latin +spoke +cars +williams +males +lack +pop +80 +##ier +acting +seeing +consists +##ti +estate +1961 +pressure +johnson +newspaper +jr +chris +olympics +online +conditions +beat +elements +walking +vote +##field +needs +carolina +text +featuring +global +block +shirt +levels +francisco +purpose +females +et +dutch +duke +ahead +gas +twice +safety +serious +turning +highly +lieutenant +firm +maria +amount +mixed +daniel +proposed +perfect +agreement +affairs +3rd +seconds +contemporary +paid +1943 +prison +save +kitchen +label +administrative +intended +constructed +academic +nice +teacher +races +1956 +formerly +corporation +ben +nation +issued +shut +1958 +drums +housing +victoria +seems +opera +1959 +graduated +function +von +mentioned +picked +build +recognized +shortly +protection +picture +notable +exchange +elections +1980s +loved +percent +racing +fish +elizabeth +garden +volume +hockey +1941 +beside +settled +##ford +1940 +competed +replied +drew +1948 +actress +marine 
+scotland +steel +glanced +farm +steve +1957 +risk +tonight +positive +magic +singles +effects +gray +screen +dog +##ja +residents +bus +sides +none +secondary +literature +polish +destroyed +flying +founder +households +1939 +lay +reserve +usa +gallery +##ler +1946 +industrial +younger +approach +appearances +urban +ones +1950 +finish +avenue +powerful +fully +growth +page +honor +jersey +projects +advanced +revealed +basic +90 +infantry +pair +equipment +visit +33 +evening +search +grant +effort +solo +treatment +buried +republican +primarily +bottom +owner +1970s +israel +gives +jim +dream +bob +remain +spot +70 +notes +produce +champions +contact +ed +soul +accepted +ways +del +##ally +losing +split +price +capacity +basis +trial +questions +##ina +1955 +20th +guess +officially +memorial +naval +initial +##ization +whispered +median +engineer +##ful +sydney +##go +columbia +strength +300 +1952 +tears +senate +00 +card +asian +agent +1947 +software +44 +draw +warm +supposed +com +pro +##il +transferred +leaned +##at +candidate +escape +mountains +asia +potential +activity +entertainment +seem +traffic +jackson +murder +36 +slow +product +orchestra +haven +agency +bbc +taught +website +comedy +unable +storm +planning +albums +rugby +environment +scientific +grabbed +protect +##hi +boat +typically +1954 +1953 +damage +principal +divided +dedicated +mount +ohio +##berg +pick +fought +driver +##der +empty +shoulders +sort +thank +berlin +prominent +account +freedom +necessary +efforts +alex +headquarters +follows +alongside +des +simon +andrew +suggested +operating +learning +steps +1949 +sweet +technical +begin +easily +34 +teeth +speaking +settlement +scale +##sh +renamed +ray +max +enemy +semi +joint +compared +##rd +scottish +leadership +analysis +offers +georgia +pieces +captured +animal +deputy +guest +organized +##lin +tony +combined +method +challenge +1960s +huge +wants +battalion +sons +rise +crime +types +facilities +telling +path +1951 +platform +sit +1990s +##lo +tells +assigned +rich +pull +##ot +commonly +alive +##za +letters +concept +conducted +wearing +happen +bought +becomes +holy +gets +ocean +defeat +languages +purchased +coffee +occurred +titled +##q +declared +applied +sciences +concert +sounds +jazz +brain +##me +painting +fleet +tax +nick +##ius +michigan +count +animals +leaders +episodes +##line +content +##den +birth +##it +clubs +64 +palace +critical +refused +fair +leg +laughed +returning +surrounding +participated +formation +lifted +pointed +connected +rome +medicine +laid +taylor +santa +powers +adam +tall +shared +focused +knowing +yards +entrance +falls +##wa +calling +##ad +sources +chosen +beneath +resources +yard +##ite +nominated +silence +zone +defined +##que +gained +thirty +38 +bodies +moon +##ard +adopted +christmas +widely +register +apart +iran +premier +serves +du +unknown +parties +##les +generation +##ff +continues +quick +fields +brigade +quiet +teaching +clothes +impact +weapons +partner +flat +theater +supreme +1938 +37 +relations +##tor +plants +suffered +1936 +wilson +kids +begins +##age +1918 +seats +armed +internet +models +worth +laws +400 +communities +classes +background +knows +thanks +quarter +reaching +humans +carry +killing +format +kong +hong +setting +75 +architecture +disease +railroad +inc +possibly +wish +arthur +thoughts +harry +doors +density +##di +crowd +illinois +stomach +tone +unique +reports +anyway +##ir +liberal +der +vehicle +thick +dry +drug +faced +largely +facility +theme +holds +creation +strange +colonel +##mi 
+revolution +bell +politics +turns +silent +rail +relief +independence +combat +shape +write +determined +sales +learned +4th +finger +oxford +providing +1937 +heritage +fiction +situated +designated +allowing +distribution +hosted +##est +sight +interview +estimated +reduced +##ria +toronto +footballer +keeping +guys +damn +claim +motion +sport +sixth +stayed +##ze +en +rear +receive +handed +twelve +dress +audience +granted +brazil +##well +spirit +##ated +noticed +etc +olympic +representative +eric +tight +trouble +reviews +drink +vampire +missing +roles +ranked +newly +household +finals +wave +critics +##ee +phase +massachusetts +pilot +unlike +philadelphia +bright +guns +crown +organizations +roof +42 +respectively +clearly +tongue +marked +circle +fox +korea +bronze +brian +expanded +sexual +supply +yourself +inspired +labour +fc +##ah +reference +vision +draft +connection +brand +reasons +1935 +classic +driving +trip +jesus +cells +entry +1920 +neither +trail +claims +atlantic +orders +labor +nose +afraid +identified +intelligence +calls +cancer +attacked +passing +stephen +positions +imperial +grey +jason +39 +sunday +48 +swedish +avoid +extra +uncle +message +covers +allows +surprise +materials +fame +hunter +##ji +1930 +citizens +figures +davis +environmental +confirmed +shit +titles +di +performing +difference +acts +attacks +##ov +existing +votes +opportunity +nor +shop +entirely +trains +opposite +pakistan +##pa +develop +resulted +representatives +actions +reality +pressed +##ish +barely +wine +conversation +faculty +northwest +ends +documentary +nuclear +stock +grace +sets +eat +alternative +##ps +bag +resulting +creating +surprised +cemetery +1919 +drop +finding +sarah +cricket +streets +tradition +ride +1933 +exhibition +target +ear +explained +rain +composer +injury +apartment +municipal +educational +occupied +netherlands +clean +billion +constitution +learn +1914 +maximum +classical +francis +lose +opposition +jose +ontario +bear +core +hills +rolled +ending +drawn +permanent +fun +##tes +##lla +lewis +sites +chamber +ryan +##way +scoring +height +1934 +##house +lyrics +staring +55 +officials +1917 +snow +oldest +##tic +orange +##ger +qualified +interior +apparently +succeeded +thousand +dinner +lights +existence +fans +heavily +41 +greatest +conservative +send +bowl +plus +enter +catch +##un +economy +duty +1929 +speech +authorities +princess +performances +versions +shall +graduate +pictures +effective +remembered +poetry +desk +crossed +starring +starts +passenger +sharp +##ant +acres +ass +weather +falling +rank +fund +supporting +check +adult +publishing +heads +cm +southeast +lane +##burg +application +bc +##ura +les +condition +transfer +prevent +display +ex +regions +earl +federation +cool +relatively +answered +besides +1928 +obtained +portion +##town +mix +##ding +reaction +liked +dean +express +peak +1932 +##tte +counter +religion +chain +rare +miller +convention +aid +lie +vehicles +mobile +perform +squad +wonder +lying +crazy +sword +##ping +attempted +centuries +weren +philosophy +category +##ize +anna +interested +47 +sweden +wolf +frequently +abandoned +kg +literary +alliance +task +entitled +##ay +threw +promotion +factory +tiny +soccer +visited +matt +fm +achieved +52 +defence +internal +persian +43 +methods +##ging +arrested +otherwise +cambridge +programming +villages +elementary +districts +rooms +criminal +conflict +worry +trained +1931 +attempts +waited +signal +bird +truck +subsequent +programme +##ol +ad +49 +communist +details +faith +sector 
+patrick +carrying +laugh +##ss +controlled +korean +showing +origin +fuel +evil +1927 +##ent +brief +identity +darkness +address +pool +missed +publication +web +planet +ian +anne +wings +invited +##tt +briefly +standards +kissed +##be +ideas +climate +causing +walter +worse +albert +articles +winners +desire +aged +northeast +dangerous +gate +doubt +1922 +wooden +multi +##ky +poet +rising +funding +46 +communications +communication +violence +copies +prepared +ford +investigation +skills +1924 +pulling +electronic +##ak +##ial +##han +containing +ultimately +offices +singing +understanding +restaurant +tomorrow +fashion +christ +ward +da +pope +stands +5th +flow +studios +aired +commissioned +contained +exist +fresh +americans +##per +wrestling +approved +kid +employed +respect +suit +1925 +angel +asking +increasing +frame +angry +selling +1950s +thin +finds +##nd +temperature +statement +ali +explain +inhabitants +towns +extensive +narrow +51 +jane +flowers +images +promise +somewhere +object +fly +closely +##ls +1912 +bureau +cape +1926 +weekly +presidential +legislative +1921 +##ai +##au +launch +founding +##ny +978 +##ring +artillery +strike +un +institutions +roll +writers +landing +chose +kevin +anymore +pp +##ut +attorney +fit +dan +billboard +receiving +agricultural +breaking +sought +dave +admitted +lands +mexican +##bury +charlie +specifically +hole +iv +howard +credit +moscow +roads +accident +1923 +proved +wear +struck +hey +guards +stuff +slid +expansion +1915 +cat +anthony +##kin +melbourne +opposed +sub +southwest +architect +failure +plane +1916 +##ron +map +camera +tank +listen +regarding +wet +introduction +metropolitan +link +ep +fighter +inch +grown +gene +anger +fixed +buy +dvd +khan +domestic +worldwide +chapel +mill +functions +examples +##head +developing +1910 +turkey +hits +pocket +antonio +papers +grow +unless +circuit +18th +concerned +attached +journalist +selection +journey +converted +provincial +painted +hearing +aren +bands +negative +aside +wondered +knight +lap +survey +ma +##ow +noise +billy +##ium +shooting +guide +bedroom +priest +resistance +motor +homes +sounded +giant +##mer +150 +scenes +equal +comic +patients +hidden +solid +actual +bringing +afternoon +touched +funds +wedding +consisted +marie +canal +sr +kim +treaty +turkish +recognition +residence +cathedral +broad +knees +incident +shaped +fired +norwegian +handle +cheek +contest +represent +##pe +representing +beauty +##sen +birds +advantage +emergency +wrapped +drawing +notice +pink +broadcasting +##ong +somehow +bachelor +seventh +collected +registered +establishment +alan +assumed +chemical +personnel +roger +retirement +jeff +portuguese +wore +tied +device +threat +progress +advance +##ised +banks +hired +manchester +nfl +teachers +structures +forever +##bo +tennis +helping +saturday +sale +applications +junction +hip +incorporated +neighborhood +dressed +ceremony +##ds +influenced +hers +visual +stairs +decades +inner +kansas +hung +hoped +gain +scheduled +downtown +engaged +austria +clock +norway +certainly +pale +protected +1913 +victor +employees +plate +putting +surrounded +##ists +finishing +blues +tropical +##ries +minnesota +consider +philippines +accept +54 +retrieved +1900 +concern +anderson +properties +institution +gordon +successfully +vietnam +##dy +backing +outstanding +muslim +crossing +folk +producing +usual +demand +occurs +observed +lawyer +educated +##ana +kelly +string +pleasure +budget +items +quietly +colorado +philip +typical +##worth +derived +600 +survived 
+asks +mental +##ide +56 +jake +jews +distinguished +ltd +1911 +sri +extremely +53 +athletic +loud +thousands +worried +shadow +transportation +horses +weapon +arena +importance +users +tim +objects +contributed +dragon +douglas +aware +senator +johnny +jordan +sisters +engines +flag +investment +samuel +shock +capable +clark +row +wheel +refers +session +familiar +biggest +wins +hate +maintained +drove +hamilton +request +expressed +injured +underground +churches +walker +wars +tunnel +passes +stupid +agriculture +softly +cabinet +regarded +joining +indiana +##ea +##ms +push +dates +spend +behavior +woods +protein +gently +chase +morgan +mention +burning +wake +combination +occur +mirror +leads +jimmy +indeed +impossible +singapore +paintings +covering +##nes +soldier +locations +attendance +sell +historian +wisconsin +invasion +argued +painter +diego +changing +egypt +##don +experienced +inches +##ku +missouri +vol +grounds +spoken +switzerland +##gan +reform +rolling +ha +forget +massive +resigned +burned +allen +tennessee +locked +values +improved +##mo +wounded +universe +sick +dating +facing +pack +purchase +user +##pur +moments +##ul +merged +anniversary +1908 +coal +brick +understood +causes +dynasty +queensland +establish +stores +crisis +promote +hoping +views +cards +referee +extension +##si +raise +arizona +improve +colonial +formal +charged +##rt +palm +lucky +hide +rescue +faces +95 +feelings +candidates +juan +##ell +goods +6th +courses +weekend +59 +luke +cash +fallen +##om +delivered +affected +installed +carefully +tries +swiss +hollywood +costs +lincoln +responsibility +##he +shore +file +proper +normally +maryland +assistance +jump +constant +offering +friendly +waters +persons +realize +contain +trophy +800 +partnership +factor +58 +musicians +cry +bound +oregon +indicated +hero +houston +medium +##ure +consisting +somewhat +##ara +57 +cycle +##che +beer +moore +frederick +gotten +eleven +worst +weak +approached +arranged +chin +loan +universal +bond +fifteen +pattern +disappeared +##ney +translated +##zed +lip +arab +capture +interests +insurance +##chi +shifted +cave +prix +warning +sections +courts +coat +plot +smell +feed +golf +favorite +maintain +knife +vs +voted +degrees +finance +quebec +opinion +translation +manner +ruled +operate +productions +choose +musician +discovery +confused +tired +separated +stream +techniques +committed +attend +ranking +kings +throw +passengers +measure +horror +fan +mining +sand +danger +salt +calm +decade +dam +require +runner +##ik +rush +associate +greece +##ker +rivers +consecutive +matthew +##ski +sighed +sq +documents +steam +edited +closing +tie +accused +1905 +##ini +islamic +distributed +directors +organisation +bruce +7th +breathing +mad +lit +arrival +concrete +taste +08 +composition +shaking +faster +amateur +adjacent +stating +1906 +twin +flew +##ran +tokyo +publications +##tone +obviously +ridge +storage +1907 +carl +pages +concluded +desert +driven +universities +ages +terminal +sequence +borough +250 +constituency +creative +cousin +economics +dreams +margaret +notably +reduce +montreal +mode +17th +ears +saved +jan +vocal +##ica +1909 +andy +##jo +riding +roughly +threatened +##ise +meters +meanwhile +landed +compete +repeated +grass +czech +regularly +charges +tea +sudden +appeal +##ung +solution +describes +pierre +classification +glad +parking +##ning +belt +physics +99 +rachel +add +hungarian +participate +expedition +damaged +gift +childhood +85 +fifty +##red +mathematics +jumped +letting +defensive +mph +##ux 
+##gh +testing +##hip +hundreds +shoot +owners +matters +smoke +israeli +kentucky +dancing +mounted +grandfather +emma +designs +profit +argentina +##gs +truly +li +lawrence +cole +begun +detroit +willing +branches +smiling +decide +miami +enjoyed +recordings +##dale +poverty +ethnic +gay +##bi +gary +arabic +09 +accompanied +##one +##ons +fishing +determine +residential +acid +##ary +alice +returns +starred +mail +##ang +jonathan +strategy +##ue +net +forty +cook +businesses +equivalent +commonwealth +distinct +ill +##cy +seriously +##ors +##ped +shift +harris +replace +rio +imagine +formula +ensure +##ber +additionally +scheme +conservation +occasionally +purposes +feels +favor +##and +##ore +1930s +contrast +hanging +hunt +movies +1904 +instruments +victims +danish +christopher +busy +demon +sugar +earliest +colony +studying +balance +duties +##ks +belgium +slipped +carter +05 +visible +stages +iraq +fifa +##im +commune +forming +zero +07 +continuing +talked +counties +legend +bathroom +option +tail +clay +daughters +afterwards +severe +jaw +visitors +##ded +devices +aviation +russell +kate +##vi +entering +subjects +##ino +temporary +swimming +forth +smooth +ghost +audio +bush +operates +rocks +movements +signs +eddie +##tz +ann +voices +honorary +06 +memories +dallas +pure +measures +racial +promised +66 +harvard +ceo +16th +parliamentary +indicate +benefit +flesh +dublin +louisiana +1902 +1901 +patient +sleeping +1903 +membership +coastal +medieval +wanting +element +scholars +rice +62 +limit +survive +makeup +rating +definitely +collaboration +obvious +##tan +boss +ms +baron +birthday +linked +soil +diocese +##lan +ncaa +##mann +offensive +shell +shouldn +waist +##tus +plain +ross +organ +resolution +manufacturing +adding +relative +kennedy +98 +whilst +moth +marketing +gardens +crash +72 +heading +partners +credited +carlos +moves +cable +##zi +marshall +##out +depending +bottle +represents +rejected +responded +existed +04 +jobs +denmark +lock +##ating +treated +graham +routes +talent +commissioner +drugs +secure +tests +reign +restored +photography +##gi +contributions +oklahoma +designer +disc +grin +seattle +robin +paused +atlanta +unusual +##gate +praised +las +laughing +satellite +hungary +visiting +##sky +interesting +factors +deck +poems +norman +##water +stuck +speaker +rifle +domain +premiered +##her +dc +comics +actors +01 +reputation +eliminated +8th +ceiling +prisoners +script +##nce +leather +austin +mississippi +rapidly +admiral +parallel +charlotte +guilty +tools +gender +divisions +fruit +##bs +laboratory +nelson +fantasy +marry +rapid +aunt +tribe +requirements +aspects +suicide +amongst +adams +bone +ukraine +abc +kick +sees +edinburgh +clothing +column +rough +gods +hunting +broadway +gathered +concerns +##ek +spending +ty +12th +snapped +requires +solar +bones +cavalry +##tta +iowa +drinking +waste +index +franklin +charity +thompson +stewart +tip +flash +landscape +friday +enjoy +singh +poem +listening +##back +eighth +fred +differences +adapted +bomb +ukrainian +surgery +corporate +masters +anywhere +##more +waves +odd +sean +portugal +orleans +dick +debate +kent +eating +puerto +cleared +96 +expect +cinema +97 +guitarist +blocks +electrical +agree +involving +depth +dying +panel +struggle +##ged +peninsula +adults +novels +emerged +vienna +metro +debuted +shoes +tamil +songwriter +meets +prove +beating +instance +heaven +scared +sending +marks +artistic +passage +superior +03 +significantly +shopping +##tive +retained +##izing +malaysia +technique +cheeks 
+##ola +warren +maintenance +destroy +extreme +allied +120 +appearing +##yn +fill +advice +alabama +qualifying +policies +cleveland +hat +battery +smart +authors +10th +soundtrack +acted +dated +lb +glance +equipped +coalition +funny +outer +ambassador +roy +possibility +couples +campbell +dna +loose +ethan +supplies +1898 +gonna +88 +monster +##res +shake +agents +frequency +springs +dogs +practices +61 +gang +plastic +easier +suggests +gulf +blade +exposed +colors +industries +markets +pan +nervous +electoral +charts +legislation +ownership +##idae +mac +appointment +shield +copy +assault +socialist +abbey +monument +license +throne +employment +jay +93 +replacement +charter +cloud +powered +suffering +accounts +oak +connecticut +strongly +wright +colour +crystal +13th +context +welsh +networks +voiced +gabriel +jerry +##cing +forehead +mp +##ens +manage +schedule +totally +remix +##ii +forests +occupation +print +nicholas +brazilian +strategic +vampires +engineers +76 +roots +seek +correct +instrumental +und +alfred +backed +hop +##des +stanley +robinson +traveled +wayne +welcome +austrian +achieve +67 +exit +rates +1899 +strip +whereas +##cs +sing +deeply +adventure +bobby +rick +jamie +careful +components +cap +useful +personality +knee +##shi +pushing +hosts +02 +protest +ca +ottoman +symphony +##sis +63 +boundary +1890 +processes +considering +considerable +tons +##work +##ft +##nia +cooper +trading +dear +conduct +91 +illegal +apple +revolutionary +holiday +definition +harder +##van +jacob +circumstances +destruction +##lle +popularity +grip +classified +liverpool +donald +baltimore +flows +seeking +honour +approval +92 +mechanical +till +happening +statue +critic +increasingly +immediate +describe +commerce +stare +##ster +indonesia +meat +rounds +boats +baker +orthodox +depression +formally +worn +naked +claire +muttered +sentence +11th +emily +document +77 +criticism +wished +vessel +spiritual +bent +virgin +parker +minimum +murray +lunch +danny +printed +compilation +keyboards +false +blow +belonged +68 +raising +78 +cutting +##board +pittsburgh +##up +9th +shadows +81 +hated +indigenous +jon +15th +barry +scholar +ah +##zer +oliver +##gy +stick +susan +meetings +attracted +spell +romantic +##ver +ye +1895 +photo +demanded +customers +##ac +1896 +logan +revival +keys +modified +commanded +jeans +##ious +upset +raw +phil +detective +hiding +resident +vincent +##bly +experiences +diamond +defeating +coverage +lucas +external +parks +franchise +helen +bible +successor +percussion +celebrated +il +lift +profile +clan +romania +##ied +mills +##su +nobody +achievement +shrugged +fault +1897 +rhythm +initiative +breakfast +carbon +700 +69 +lasted +violent +74 +wound +ken +killer +gradually +filmed +°c +dollars +processing +94 +remove +criticized +guests +sang +chemistry +##vin +legislature +disney +##bridge +uniform +escaped +integrated +proposal +purple +denied +liquid +karl +influential +morris +nights +stones +intense +experimental +twisted +71 +84 +##ld +pace +nazi +mitchell +ny +blind +reporter +newspapers +14th +centers +burn +basin +forgotten +surviving +filed +collections +monastery +losses +manual +couch +description +appropriate +merely +tag +missions +sebastian +restoration +replacing +triple +73 +elder +julia +warriors +benjamin +julian +convinced +stronger +amazing +declined +versus +merchant +happens +output +finland +bare +barbara +absence +ignored +dawn +injuries +##port +producers +##ram +82 +luis +##ities +kw +admit +expensive +electricity +nba +exception +symbol 
+##ving +ladies +shower +sheriff +characteristics +##je +aimed +button +ratio +effectively +summit +angle +jury +bears +foster +vessels +pants +executed +evans +dozen +advertising +kicked +patrol +1889 +competitions +lifetime +principles +athletics +##logy +birmingham +sponsored +89 +rob +nomination +1893 +acoustic +##sm +creature +longest +##tra +credits +harbor +dust +josh +##so +territories +milk +infrastructure +completion +thailand +indians +leon +archbishop +##sy +assist +pitch +blake +arrangement +girlfriend +serbian +operational +hence +sad +scent +fur +dj +sessions +hp +refer +rarely +##ora +exists +1892 +##ten +scientists +dirty +penalty +burst +portrait +seed +79 +pole +limits +rival +1894 +stable +alpha +grave +constitutional +alcohol +arrest +flower +mystery +devil +architectural +relationships +greatly +habitat +##istic +larry +progressive +remote +cotton +##ics +##ok +preserved +reaches +##ming +cited +86 +vast +scholarship +decisions +cbs +joy +teach +1885 +editions +knocked +eve +searching +partly +participation +gap +animated +fate +excellent +##ett +na +87 +alternate +saints +youngest +##ily +climbed +##ita +##tors +suggest +##ct +discussion +staying +choir +lakes +jacket +revenue +nevertheless +peaked +instrument +wondering +annually +managing +neil +1891 +signing +terry +##ice +apply +clinical +brooklyn +aim +catherine +fuck +farmers +figured +ninth +pride +hugh +evolution +ordinary +involvement +comfortable +shouted +tech +encouraged +taiwan +representation +sharing +##lia +##em +panic +exact +cargo +competing +fat +cried +83 +1920s +occasions +pa +cabin +borders +utah +marcus +##isation +badly +muscles +##ance +victorian +transition +warner +bet +permission +##rin +slave +terrible +similarly +shares +seth +uefa +possession +medals +benefits +colleges +lowered +perfectly +mall +transit +##ye +##kar +publisher +##ened +harrison +deaths +elevation +##ae +asleep +machines +sigh +ash +hardly +argument +occasion +parent +leo +decline +1888 +contribution +##ua +concentration +1000 +opportunities +hispanic +guardian +extent +emotions +hips +mason +volumes +bloody +controversy +diameter +steady +mistake +phoenix +identify +violin +##sk +departure +richmond +spin +funeral +enemies +1864 +gear +literally +connor +random +sergeant +grab +confusion +1865 +transmission +informed +op +leaning +sacred +suspended +thinks +gates +portland +luck +agencies +yours +hull +expert +muscle +layer +practical +sculpture +jerusalem +latest +lloyd +statistics +deeper +recommended +warrior +arkansas +mess +supports +greg +eagle +1880 +recovered +rated +concerts +rushed +##ano +stops +eggs +files +premiere +keith +##vo +delhi +turner +pit +affair +belief +paint +##zing +mate +##ach +##ev +victim +##ology +withdrew +bonus +styles +fled +##ud +glasgow +technologies +funded +nbc +adaptation +##ata +portrayed +cooperation +supporters +judges +bernard +justin +hallway +ralph +##ick +graduating +controversial +distant +continental +spider +bite +##ho +recognize +intention +mixing +##ese +egyptian +bow +tourism +suppose +claiming +tiger +dominated +participants +vi +##ru +nurse +partially +tape +##rum +psychology +##rn +essential +touring +duo +voting +civilian +emotional +channels +##king +apparent +hebrew +1887 +tommy +carrier +intersection +beast +hudson +##gar +##zo +lab +nova +bench +discuss +costa +##ered +detailed +behalf +drivers +unfortunately +obtain +##lis +rocky +##dae +siege +friendship +honey +##rian +1861 +amy +hang +posted +governments +collins +respond +wildlife +preferred +operator 
+##po +laura +pregnant +videos +dennis +suspected +boots +instantly +weird +automatic +businessman +alleged +placing +throwing +ph +mood +1862 +perry +venue +jet +remainder +##lli +##ci +passion +biological +boyfriend +1863 +dirt +buffalo +ron +segment +fa +abuse +##era +genre +thrown +stroke +colored +stress +exercise +displayed +##gen +struggled +##tti +abroad +dramatic +wonderful +thereafter +madrid +component +widespread +##sed +tale +citizen +todd +monday +1886 +vancouver +overseas +forcing +crying +descent +##ris +discussed +substantial +ranks +regime +1870 +provinces +switch +drum +zane +ted +tribes +proof +lp +cream +researchers +volunteer +manor +silk +milan +donated +allies +venture +principle +delivery +enterprise +##ves +##ans +bars +traditionally +witch +reminded +copper +##uk +pete +inter +links +colin +grinned +elsewhere +competitive +frequent +##oy +scream +##hu +tension +texts +submarine +finnish +defending +defend +pat +detail +1884 +affiliated +stuart +themes +villa +periods +tool +belgian +ruling +crimes +answers +folded +licensed +resort +demolished +hans +lucy +1881 +lion +traded +photographs +writes +craig +##fa +trials +generated +beth +noble +debt +percentage +yorkshire +erected +ss +viewed +grades +confidence +ceased +islam +telephone +retail +##ible +chile +m² +roberts +sixteen +##ich +commented +hampshire +innocent +dual +pounds +checked +regulations +afghanistan +sung +rico +liberty +assets +bigger +options +angels +relegated +tribute +wells +attending +leaf +##yan +butler +romanian +forum +monthly +lisa +patterns +gmina +##tory +madison +hurricane +rev +##ians +bristol +##ula +elite +valuable +disaster +democracy +awareness +germans +freyja +##ins +loop +absolutely +paying +populations +maine +sole +prayer +spencer +releases +doorway +bull +##ani +lover +midnight +conclusion +##sson +thirteen +lily +mediterranean +##lt +nhl +proud +sample +##hill +drummer +guinea +##ova +murphy +climb +##ston +instant +attributed +horn +ain +railways +steven +##ao +autumn +ferry +opponent +root +traveling +secured +corridor +stretched +tales +sheet +trinity +cattle +helps +indicates +manhattan +murdered +fitted +1882 +gentle +grandmother +mines +shocked +vegas +produces +##light +caribbean +##ou +belong +continuous +desperate +drunk +historically +trio +waved +raf +dealing +nathan +bat +murmured +interrupted +residing +scientist +pioneer +harold +aaron +##net +delta +attempting +minority +mini +believes +chorus +tend +lots +eyed +indoor +load +shots +updated +jail +##llo +concerning +connecting +wealth +##ved +slaves +arrive +rangers +sufficient +rebuilt +##wick +cardinal +flood +muhammad +whenever +relation +runners +moral +repair +viewers +arriving +revenge +punk +assisted +bath +fairly +breathe +lists +innings +illustrated +whisper +nearest +voters +clinton +ties +ultimate +screamed +beijing +lions +andre +fictional +gathering +comfort +radar +suitable +dismissed +hms +ban +pine +wrist +atmosphere +voivodeship +bid +timber +##ned +##nan +giants +##ane +cameron +recovery +uss +identical +categories +switched +serbia +laughter +noah +ensemble +therapy +peoples +touching +##off +locally +pearl +platforms +everywhere +ballet +tables +lanka +herbert +outdoor +toured +derek +1883 +spaces +contested +swept +1878 +exclusive +slight +connections +##dra +winds +prisoner +collective +bangladesh +tube +publicly +wealthy +thai +##ys +isolated +select +##ric +insisted +pen +fortune +ticket +spotted +reportedly +animation +enforcement +tanks +110 +decides +wider +lowest +owen +##time +nod 
+hitting +##hn +gregory +furthermore +magazines +fighters +solutions +##ery +pointing +requested +peru +reed +chancellor +knights +mask +worker +eldest +flames +reduction +1860 +volunteers +##tis +reporting +##hl +wire +advisory +endemic +origins +settlers +pursue +knock +consumer +1876 +eu +compound +creatures +mansion +sentenced +ivan +deployed +guitars +frowned +involves +mechanism +kilometers +perspective +shops +maps +terminus +duncan +alien +fist +bridges +##pers +heroes +fed +derby +swallowed +##ros +patent +sara +illness +characterized +adventures +slide +hawaii +jurisdiction +##op +organised +##side +adelaide +walks +biology +se +##ties +rogers +swing +tightly +boundaries +##rie +prepare +implementation +stolen +##sha +certified +colombia +edwards +garage +##mm +recalled +##ball +rage +harm +nigeria +breast +##ren +furniture +pupils +settle +##lus +cuba +balls +client +alaska +21st +linear +thrust +celebration +latino +genetic +terror +##cia +##ening +lightning +fee +witness +lodge +establishing +skull +##ique +earning +hood +##ei +rebellion +wang +sporting +warned +missile +devoted +activist +porch +worship +fourteen +package +1871 +decorated +##shire +housed +##ock +chess +sailed +doctors +oscar +joan +treat +garcia +harbour +jeremy +##ire +traditions +dominant +jacques +##gon +##wan +relocated +1879 +amendment +sized +companion +simultaneously +volleyball +spun +acre +increases +stopping +loves +belongs +affect +drafted +tossed +scout +battles +1875 +filming +shoved +munich +tenure +vertical +romance +pc +##cher +argue +##ical +craft +ranging +www +opens +honest +tyler +yesterday +virtual +##let +muslims +reveal +snake +immigrants +radical +screaming +speakers +firing +saving +belonging +ease +lighting +prefecture +blame +farmer +hungry +grows +rubbed +beam +sur +subsidiary +##cha +armenian +sao +dropping +conventional +##fer +microsoft +reply +qualify +spots +1867 +sweat +festivals +##ken +immigration +physician +discover +exposure +sandy +explanation +isaac +implemented +##fish +hart +initiated +connect +stakes +presents +heights +householder +pleased +tourist +regardless +slip +closest +##ction +surely +sultan +brings +riley +preparation +aboard +slammed +baptist +experiment +ongoing +interstate +organic +playoffs +##ika +1877 +130 +##tar +hindu +error +tours +tier +plenty +arrangements +talks +trapped +excited +sank +ho +athens +1872 +denver +welfare +suburb +athletes +trick +diverse +belly +exclusively +yelled +1868 +##med +conversion +##ette +1874 +internationally +computers +conductor +abilities +sensitive +hello +dispute +measured +globe +rocket +prices +amsterdam +flights +tigers +inn +municipalities +emotion +references +3d +##mus +explains +airlines +manufactured +pm +archaeological +1873 +interpretation +devon +comment +##ites +settlements +kissing +absolute +improvement +suite +impressed +barcelona +sullivan +jefferson +towers +jesse +julie +##tin +##lu +grandson +hi +gauge +regard +rings +interviews +trace +raymond +thumb +departments +burns +serial +bulgarian +scores +demonstrated +##ix +1866 +kyle +alberta +underneath +romanized +##ward +relieved +acquisition +phrase +cliff +reveals +han +cuts +merger +custom +##dar +nee +gilbert +graduation +##nts +assessment +cafe +difficulty +demands +swung +democrat +jennifer +commons +1940s +grove +##yo +completing +focuses +sum +substitute +bearing +stretch +reception +##py +reflected +essentially +destination +pairs +##ched +survival +resource +##bach +promoting +doubles +messages +tear +##down +##fully +parade +florence 
+harvey +incumbent +partial +framework +900 +pedro +frozen +procedure +olivia +controls +##mic +shelter +personally +temperatures +##od +brisbane +tested +sits +marble +comprehensive +oxygen +leonard +##kov +inaugural +iranian +referring +quarters +attitude +##ivity +mainstream +lined +mars +dakota +norfolk +unsuccessful +##° +explosion +helicopter +congressional +##sing +inspector +bitch +seal +departed +divine +##ters +coaching +examination +punishment +manufacturer +sink +columns +unincorporated +signals +nevada +squeezed +dylan +dining +photos +martial +manuel +eighteen +elevator +brushed +plates +ministers +ivy +congregation +##len +slept +specialized +taxes +curve +restricted +negotiations +likes +statistical +arnold +inspiration +execution +bold +intermediate +significance +margin +ruler +wheels +gothic +intellectual +dependent +listened +eligible +buses +widow +syria +earn +cincinnati +collapsed +recipient +secrets +accessible +philippine +maritime +goddess +clerk +surrender +breaks +playoff +database +##ified +##lon +ideal +beetle +aspect +soap +regulation +strings +expand +anglo +shorter +crosses +retreat +tough +coins +wallace +directions +pressing +##oon +shipping +locomotives +comparison +topics +nephew +##mes +distinction +honors +travelled +sierra +ibn +##over +fortress +sa +recognised +carved +1869 +clients +##dan +intent +##mar +coaches +describing +bread +##ington +beaten +northwestern +##ona +merit +youtube +collapse +challenges +em +historians +objective +submitted +virus +attacking +drake +assume +##ere +diseases +marc +stem +leeds +##cus +##ab +farming +glasses +##lock +visits +nowhere +fellowship +relevant +carries +restaurants +experiments +101 +constantly +bases +targets +shah +tenth +opponents +verse +territorial +##ira +writings +corruption +##hs +instruction +inherited +reverse +emphasis +##vic +employee +arch +keeps +rabbi +watson +payment +uh +##ala +nancy +##tre +venice +fastest +sexy +banned +adrian +properly +ruth +touchdown +dollar +boards +metre +circles +edges +favour +comments +ok +travels +liberation +scattered +firmly +##ular +holland +permitted +diesel +kenya +den +originated +##ral +demons +resumed +dragged +rider +##rus +servant +blinked +extend +torn +##ias +##sey +input +meal +everybody +cylinder +kinds +camps +##fe +bullet +logic +##wn +croatian +evolved +healthy +fool +chocolate +wise +preserve +pradesh +##ess +respective +1850 +##ew +chicken +artificial +gross +corresponding +convicted +cage +caroline +dialogue +##dor +narrative +stranger +mario +br +christianity +failing +trent +commanding +buddhist +1848 +maurice +focusing +yale +bike +altitude +##ering +mouse +revised +##sley +veteran +##ig +pulls +theology +crashed +campaigns +legion +##ability +drag +excellence +customer +cancelled +intensity +excuse +##lar +liga +participating +contributing +printing +##burn +variable +##rk +curious +bin +legacy +renaissance +##my +symptoms +binding +vocalist +dancer +##nie +grammar +gospel +democrats +ya +enters +sc +diplomatic +hitler +##ser +clouds +mathematical +quit +defended +oriented +##heim +fundamental +hardware +impressive +equally +convince +confederate +guilt +chuck +sliding +##ware +magnetic +narrowed +petersburg +bulgaria +otto +phd +skill +##ama +reader +hopes +pitcher +reservoir +hearts +automatically +expecting +mysterious +bennett +extensively +imagined +seeds +monitor +fix +##ative +journalism +struggling +signature +ranch +encounter +photographer +observation +protests +##pin +influences +##hr +calendar +##all +cruz +croatia 
+locomotive +hughes +naturally +shakespeare +basement +hook +uncredited +faded +theories +approaches +dare +phillips +filling +fury +obama +##ain +efficient +arc +deliver +min +raid +breeding +inducted +leagues +efficiency +axis +montana +eagles +##ked +supplied +instructions +karen +picking +indicating +trap +anchor +practically +christians +tomb +vary +occasional +electronics +lords +readers +newcastle +faint +innovation +collect +situations +engagement +160 +claude +mixture +##feld +peer +tissue +logo +lean +##ration +°f +floors +##ven +architects +reducing +##our +##ments +rope +1859 +ottawa +##har +samples +banking +declaration +proteins +resignation +francois +saudi +advocate +exhibited +armor +twins +divorce +##ras +abraham +reviewed +jo +temporarily +matrix +physically +pulse +curled +##ena +difficulties +bengal +usage +##ban +annie +riders +certificate +##pi +holes +warsaw +distinctive +jessica +##mon +mutual +1857 +customs +circular +eugene +removal +loaded +mere +vulnerable +depicted +generations +dame +heir +enormous +lightly +climbing +pitched +lessons +pilots +nepal +ram +google +preparing +brad +louise +renowned +##₂ +liam +##ably +plaza +shaw +sophie +brilliant +bills +##bar +##nik +fucking +mainland +server +pleasant +seized +veterans +jerked +fail +beta +brush +radiation +stored +warmth +southeastern +nate +sin +raced +berkeley +joke +athlete +designation +trunk +##low +roland +qualification +archives +heels +artwork +receives +judicial +reserves +##bed +woke +installation +abu +floating +fake +lesser +excitement +interface +concentrated +addressed +characteristic +amanda +saxophone +monk +auto +##bus +releasing +egg +dies +interaction +defender +ce +outbreak +glory +loving +##bert +sequel +consciousness +http +awake +ski +enrolled +##ress +handling +rookie +brow +somebody +biography +warfare +amounts +contracts +presentation +fabric +dissolved +challenged +meter +psychological +lt +elevated +rally +accurate +##tha +hospitals +undergraduate +specialist +venezuela +exhibit +shed +nursing +protestant +fluid +structural +footage +jared +consistent +prey +##ska +succession +reflect +exile +lebanon +wiped +suspect +shanghai +resting +integration +preservation +marvel +variant +pirates +sheep +rounded +capita +sailing +colonies +manuscript +deemed +variations +clarke +functional +emerging +boxing +relaxed +curse +azerbaijan +heavyweight +nickname +editorial +rang +grid +tightened +earthquake +flashed +miguel +rushing +##ches +improvements +boxes +brooks +180 +consumption +molecular +felix +societies +repeatedly +variation +aids +civic +graphics +professionals +realm +autonomous +receiver +delayed +workshop +militia +chairs +trump +canyon +##point +harsh +extending +lovely +happiness +##jan +stake +eyebrows +embassy +wellington +hannah +##ella +sony +corners +bishops +swear +cloth +contents +xi +namely +commenced +1854 +stanford +nashville +courage +graphic +commitment +garrison +##bin +hamlet +clearing +rebels +attraction +literacy +cooking +ruins +temples +jenny +humanity +celebrate +hasn +freight +sixty +rebel +bastard +##art +newton +##ada +deer +##ges +##ching +smiles +delaware +singers +##ets +approaching +assists +flame +##ph +boulevard +barrel +planted +##ome +pursuit +##sia +consequences +posts +shallow +invitation +rode +depot +ernest +kane +rod +concepts +preston +topic +chambers +striking +blast +arrives +descendants +montgomery +ranges +worlds +##lay +##ari +span +chaos +praise +##ag +fewer +1855 +sanctuary +mud +fbi +##ions +programmes +maintaining +unity +harper 
+bore +handsome +closure +tournaments +thunder +nebraska +linda +facade +puts +satisfied +argentine +dale +cork +dome +panama +##yl +1858 +tasks +experts +##ates +feeding +equation +##las +##ida +##tu +engage +bryan +##ax +um +quartet +melody +disbanded +sheffield +blocked +gasped +delay +kisses +maggie +connects +##non +sts +poured +creator +publishers +##we +guided +ellis +extinct +hug +gaining +##ord +complicated +##bility +poll +clenched +investigate +##use +thereby +quantum +spine +cdp +humor +kills +administered +semifinals +##du +encountered +ignore +##bu +commentary +##maker +bother +roosevelt +140 +plains +halfway +flowing +cultures +crack +imprisoned +neighboring +airline +##ses +##view +##mate +##ec +gather +wolves +marathon +transformed +##ill +cruise +organisations +carol +punch +exhibitions +numbered +alarm +ratings +daddy +silently +##stein +queens +colours +impression +guidance +liu +tactical +##rat +marshal +della +arrow +##ings +rested +feared +tender +owns +bitter +advisor +escort +##ides +spare +farms +grants +##ene +dragons +encourage +colleagues +cameras +##und +sucked +pile +spirits +prague +statements +suspension +landmark +fence +torture +recreation +bags +permanently +survivors +pond +spy +predecessor +bombing +coup +##og +protecting +transformation +glow +##lands +##book +dug +priests +andrea +feat +barn +jumping +##chen +##ologist +##con +casualties +stern +auckland +pipe +serie +revealing +ba +##bel +trevor +mercy +spectrum +yang +consist +governing +collaborated +possessed +epic +comprises +blew +shane +##ack +lopez +honored +magical +sacrifice +judgment +perceived +hammer +mtv +baronet +tune +das +missionary +sheets +350 +neutral +oral +threatening +attractive +shade +aims +seminary +##master +estates +1856 +michel +wounds +refugees +manufacturers +##nic +mercury +syndrome +porter +##iya +##din +hamburg +identification +upstairs +purse +widened +pause +cared +breathed +affiliate +santiago +prevented +celtic +fisher +125 +recruited +byzantine +reconstruction +farther +##mp +diet +sake +au +spite +sensation +##ert +blank +separation +105 +##hon +vladimir +armies +anime +##lie +accommodate +orbit +cult +sofia +archive +##ify +##box +founders +sustained +disorder +honours +northeastern +mia +crops +violet +threats +blanket +fires +canton +followers +southwestern +prototype +voyage +assignment +altered +moderate +protocol +pistol +##eo +questioned +brass +lifting +1852 +math +authored +##ual +doug +dimensional +dynamic +##san +1851 +pronounced +grateful +quest +uncomfortable +boom +presidency +stevens +relating +politicians +chen +barrier +quinn +diana +mosque +tribal +cheese +palmer +portions +sometime +chester +treasure +wu +bend +download +millions +reforms +registration +##osa +consequently +monitoring +ate +preliminary +brandon +invented +ps +eaten +exterior +intervention +ports +documented +log +displays +lecture +sally +favourite +##itz +vermont +lo +invisible +isle +breed +##ator +journalists +relay +speaks +backward +explore +midfielder +actively +stefan +procedures +cannon +blond +kenneth +centered +servants +chains +libraries +malcolm +essex +henri +slavery +##hal +facts +fairy +coached +cassie +cats +washed +cop +##fi +announcement +item +2000s +vinyl +activated +marco +frontier +growled +curriculum +##das +loyal +accomplished +leslie +ritual +kenny +##00 +vii +napoleon +hollow +hybrid +jungle +stationed +friedrich +counted +##ulated +platinum +theatrical +seated +col +rubber +glen +1840 +diversity +healing +extends +id +provisions +administrator 
+columbus +##oe +tributary +te +assured +org +##uous +prestigious +examined +lectures +grammy +ronald +associations +bailey +allan +essays +flute +believing +consultant +proceedings +travelling +1853 +kit +kerala +yugoslavia +buddy +methodist +##ith +burial +centres +batman +##nda +discontinued +bo +dock +stockholm +lungs +severely +##nk +citing +manga +##ugh +steal +mumbai +iraqi +robot +celebrity +bride +broadcasts +abolished +pot +joel +overhead +franz +packed +reconnaissance +johann +acknowledged +introduce +handled +doctorate +developments +drinks +alley +palestine +##nis +##aki +proceeded +recover +bradley +grain +patch +afford +infection +nationalist +legendary +##ath +interchange +virtually +gen +gravity +exploration +amber +vital +wishes +powell +doctrine +elbow +screenplay +##bird +contribute +indonesian +pet +creates +##com +enzyme +kylie +discipline +drops +manila +hunger +##ien +layers +suffer +fever +bits +monica +keyboard +manages +##hood +searched +appeals +##bad +testament +grande +reid +##war +beliefs +congo +##ification +##dia +si +requiring +##via +casey +1849 +regret +streak +rape +depends +syrian +sprint +pound +tourists +upcoming +pub +##xi +tense +##els +practiced +echo +nationwide +guild +motorcycle +liz +##zar +chiefs +desired +elena +bye +precious +absorbed +relatives +booth +pianist +##mal +citizenship +exhausted +wilhelm +##ceae +##hed +noting +quarterback +urge +hectares +##gue +ace +holly +##tal +blonde +davies +parked +sustainable +stepping +twentieth +airfield +galaxy +nest +chip +##nell +tan +shaft +paulo +requirement +##zy +paradise +tobacco +trans +renewed +vietnamese +##cker +##ju +suggesting +catching +holmes +enjoying +md +trips +colt +holder +butterfly +nerve +reformed +cherry +bowling +trailer +carriage +goodbye +appreciate +toy +joshua +interactive +enabled +involve +##kan +collar +determination +bunch +facebook +recall +shorts +superintendent +episcopal +frustration +giovanni +nineteenth +laser +privately +array +circulation +##ovic +armstrong +deals +painful +permit +discrimination +##wi +aires +retiring +cottage +ni +##sta +horizon +ellen +jamaica +ripped +fernando +chapters +playstation +patron +lecturer +navigation +behaviour +genes +georgian +export +solomon +rivals +swift +seventeen +rodriguez +princeton +independently +sox +1847 +arguing +entity +casting +hank +criteria +oakland +geographic +milwaukee +reflection +expanding +conquest +dubbed +##tv +halt +brave +brunswick +doi +arched +curtis +divorced +predominantly +somerset +streams +ugly +zoo +horrible +curved +buenos +fierce +dictionary +vector +theological +unions +handful +stability +chan +punjab +segments +##lly +altar +ignoring +gesture +monsters +pastor +##stone +thighs +unexpected +operators +abruptly +coin +compiled +associates +improving +migration +pin +##ose +compact +collegiate +reserved +##urs +quarterfinals +roster +restore +assembled +hurry +oval +##cies +1846 +flags +martha +##del +victories +sharply +##rated +argues +deadly +neo +drawings +symbols +performer +##iel +griffin +restrictions +editing +andrews +java +journals +arabia +compositions +dee +pierce +removing +hindi +casino +runway +civilians +minds +nasa +hotels +##zation +refuge +rent +retain +potentially +conferences +suburban +conducting +##tto +##tions +##tle +descended +massacre +##cal +ammunition +terrain +fork +souls +counts +chelsea +durham +drives +cab +##bank +perth +realizing +palestinian +finn +simpson +##dal +betty +##ule +moreover +particles +cardinals +tent +evaluation +extraordinary +##oid 
+inscription +##works +wednesday +chloe +maintains +panels +ashley +trucks +##nation +cluster +sunlight +strikes +zhang +##wing +dialect +canon +##ap +tucked +##ws +collecting +##mas +##can +##sville +maker +quoted +evan +franco +aria +buying +cleaning +eva +closet +provision +apollo +clinic +rat +##ez +necessarily +ac +##gle +##ising +venues +flipped +cent +spreading +trustees +checking +authorized +##sco +disappointed +##ado +notion +duration +trumpet +hesitated +topped +brussels +rolls +theoretical +hint +define +aggressive +repeat +wash +peaceful +optical +width +allegedly +mcdonald +strict +copyright +##illa +investors +mar +jam +witnesses +sounding +miranda +michelle +privacy +hugo +harmony +##pp +valid +lynn +glared +nina +102 +headquartered +diving +boarding +gibson +##ncy +albanian +marsh +routine +dealt +enhanced +er +intelligent +substance +targeted +enlisted +discovers +spinning +observations +pissed +smoking +rebecca +capitol +visa +varied +costume +seemingly +indies +compensation +surgeon +thursday +arsenal +westminster +suburbs +rid +anglican +##ridge +knots +foods +alumni +lighter +fraser +whoever +portal +scandal +##ray +gavin +advised +instructor +flooding +terrorist +##ale +teenage +interim +senses +duck +teen +thesis +abby +eager +overcome +##ile +newport +glenn +rises +shame +##cc +prompted +priority +forgot +bomber +nicolas +protective +360 +cartoon +katherine +breeze +lonely +trusted +henderson +richardson +relax +banner +candy +palms +remarkable +##rio +legends +cricketer +essay +ordained +edmund +rifles +trigger +##uri +##away +sail +alert +1830 +audiences +penn +sussex +siblings +pursued +indianapolis +resist +rosa +consequence +succeed +avoided +1845 +##ulation +inland +##tie +##nna +counsel +profession +chronicle +hurried +##una +eyebrow +eventual +bleeding +innovative +cure +##dom +committees +accounting +con +scope +hardy +heather +tenor +gut +herald +codes +tore +scales +wagon +##oo +luxury +tin +prefer +fountain +triangle +bonds +darling +convoy +dried +traced +beings +troy +accidentally +slam +findings +smelled +joey +lawyers +outcome +steep +bosnia +configuration +shifting +toll +brook +performers +lobby +philosophical +construct +shrine +aggregate +boot +cox +phenomenon +savage +insane +solely +reynolds +lifestyle +##ima +nationally +holdings +consideration +enable +edgar +mo +mama +##tein +fights +relegation +chances +atomic +hub +conjunction +awkward +reactions +currency +finale +kumar +underwent +steering +elaborate +gifts +comprising +melissa +veins +reasonable +sunshine +chi +solve +trails +inhabited +elimination +ethics +huh +ana +molly +consent +apartments +layout +marines +##ces +hunters +bulk +##oma +hometown +##wall +##mont +cracked +reads +neighbouring +withdrawn +admission +wingspan +damned +anthology +lancashire +brands +batting +forgive +cuban +awful +##lyn +104 +dimensions +imagination +##ade +dante +##ship +tracking +desperately +goalkeeper +##yne +groaned +workshops +confident +burton +gerald +milton +circus +uncertain +slope +copenhagen +sophia +fog +philosopher +portraits +accent +cycling +varying +gripped +larvae +garrett +specified +scotia +mature +luther +kurt +rap +##kes +aerial +750 +ferdinand +heated +es +transported +##shan +safely +nonetheless +##orn +##gal +motors +demanding +##sburg +startled +##brook +ally +generate +caps +ghana +stained +demo +mentions +beds +ap +afterward +diary +##bling +utility +##iro +richards +1837 +conspiracy +conscious +shining +footsteps +observer +cyprus +urged +loyalty +developer +probability 
+olive +upgraded +gym +miracle +insects +graves +1844 +ourselves +hydrogen +amazon +katie +tickets +poets +##pm +planes +##pan +prevention +witnessed +dense +jin +randy +tang +warehouse +monroe +bang +archived +elderly +investigations +alec +granite +mineral +conflicts +controlling +aboriginal +carlo +##zu +mechanics +stan +stark +rhode +skirt +est +##berry +bombs +respected +##horn +imposed +limestone +deny +nominee +memphis +grabbing +disabled +##als +amusement +aa +frankfurt +corn +referendum +varies +slowed +disk +firms +unconscious +incredible +clue +sue +##zhou +twist +##cio +joins +idaho +chad +developers +computing +destroyer +103 +mortal +tucker +kingston +choices +yu +carson +1800 +os +whitney +geneva +pretend +dimension +staged +plateau +maya +##une +freestyle +##bc +rovers +hiv +##ids +tristan +classroom +prospect +##hus +honestly +diploma +lied +thermal +auxiliary +feast +unlikely +iata +##tel +morocco +pounding +treasury +lithuania +considerably +1841 +dish +1812 +geological +matching +stumbled +destroying +marched +brien +advances +cake +nicole +belle +settling +measuring +directing +##mie +tuesday +bassist +capabilities +stunned +fraud +torpedo +##list +##phone +anton +wisdom +surveillance +ruined +##ulate +lawsuit +healthcare +theorem +halls +trend +aka +horizontal +dozens +acquire +lasting +swim +hawk +gorgeous +fees +vicinity +decrease +adoption +tactics +##ography +pakistani +##ole +draws +##hall +willie +burke +heath +algorithm +integral +powder +elliott +brigadier +jackie +tate +varieties +darker +##cho +lately +cigarette +specimens +adds +##ree +##ensis +##inger +exploded +finalist +cia +murders +wilderness +arguments +nicknamed +acceptance +onwards +manufacture +robertson +jets +tampa +enterprises +blog +loudly +composers +nominations +1838 +ai +malta +inquiry +automobile +hosting +viii +rays +tilted +grief +museums +strategies +furious +euro +equality +cohen +poison +surrey +wireless +governed +ridiculous +moses +##esh +##room +vanished +##ito +barnes +attract +morrison +istanbul +##iness +absent +rotation +petition +janet +##logical +satisfaction +custody +deliberately +observatory +comedian +surfaces +pinyin +novelist +strictly +canterbury +oslo +monks +embrace +ibm +jealous +photograph +continent +dorothy +marina +doc +excess +holden +allegations +explaining +stack +avoiding +lance +storyline +majesty +poorly +spike +dos +bradford +raven +travis +classics +proven +voltage +pillow +fists +butt +1842 +interpreted +##car +1839 +gage +telegraph +lens +promising +expelled +casual +collector +zones +##min +silly +nintendo +##kh +##bra +downstairs +chef +suspicious +afl +flies +vacant +uganda +pregnancy +condemned +lutheran +estimates +cheap +decree +saxon +proximity +stripped +idiot +deposits +contrary +presenter +magnus +glacier +im +offense +edwin +##ori +upright +##long +bolt +##ois +toss +geographical +##izes +environments +delicate +marking +abstract +xavier +nails +windsor +plantation +occurring +equity +saskatchewan +fears +drifted +sequences +vegetation +revolt +##stic +1843 +sooner +fusion +opposing +nato +skating +1836 +secretly +ruin +lease +##oc +edit +##nne +flora +anxiety +ruby +##ological +##mia +tel +bout +taxi +emmy +frost +rainbow +compounds +foundations +rainfall +assassination +nightmare +dominican +##win +achievements +deserve +orlando +intact +armenia +##nte +calgary +valentine +106 +marion +proclaimed +theodore +bells +courtyard +thigh +gonzalez +console +troop +minimal +monte +everyday +##ence +##if +supporter +terrorism +buck +openly 
+presbyterian +activists +carpet +##iers +rubbing +uprising +##yi +cute +conceived +legally +##cht +millennium +cello +velocity +ji +rescued +cardiff +1835 +rex +concentrate +senators +beard +rendered +glowing +battalions +scouts +competitors +sculptor +catalogue +arctic +ion +raja +bicycle +wow +glancing +lawn +##woman +gentleman +lighthouse +publish +predicted +calculated +##val +variants +##gne +strain +##ui +winston +deceased +##nus +touchdowns +brady +caleb +sinking +echoed +crush +hon +blessed +protagonist +hayes +endangered +magnitude +editors +##tine +estimate +responsibilities +##mel +backup +laying +consumed +sealed +zurich +lovers +frustrated +##eau +ahmed +kicking +mit +treasurer +1832 +biblical +refuse +terrified +pump +agrees +genuine +imprisonment +refuses +plymouth +##hen +lou +##nen +tara +trembling +antarctic +ton +learns +##tas +crap +crucial +faction +atop +##borough +wrap +lancaster +odds +hopkins +erik +lyon +##eon +bros +##ode +snap +locality +tips +empress +crowned +cal +acclaimed +chuckled +##ory +clara +sends +mild +towel +##fl +##day +##а +wishing +assuming +interviewed +##bal +##die +interactions +eden +cups +helena +##lf +indie +beck +##fire +batteries +filipino +wizard +parted +##lam +traces +##born +rows +idol +albany +delegates +##ees +##sar +discussions +##ex +notre +instructed +belgrade +highways +suggestion +lauren +possess +orientation +alexandria +abdul +beats +salary +reunion +ludwig +alright +wagner +intimate +pockets +slovenia +hugged +brighton +merchants +cruel +stole +trek +slopes +repairs +enrollment +politically +underlying +promotional +counting +boeing +##bb +isabella +naming +##и +keen +bacteria +listing +separately +belfast +ussr +450 +lithuanian +anybody +ribs +sphere +martinez +cock +embarrassed +proposals +fragments +nationals +##fs +##wski +premises +fin +1500 +alpine +matched +freely +bounded +jace +sleeve +##af +gaming +pier +populated +evident +##like +frances +flooded +##dle +frightened +pour +trainer +framed +visitor +challenging +pig +wickets +##fold +infected +email +##pes +arose +##aw +reward +ecuador +oblast +vale +ch +shuttle +##usa +bach +rankings +forbidden +cornwall +accordance +salem +consumers +bruno +fantastic +toes +machinery +resolved +julius +remembering +propaganda +iceland +bombardment +tide +contacts +wives +##rah +concerto +macdonald +albania +implement +daisy +tapped +sudan +helmet +angela +mistress +##lic +crop +sunk +finest +##craft +hostile +##ute +##tsu +boxer +fr +paths +adjusted +habit +ballot +supervision +soprano +##zen +bullets +wicked +sunset +regiments +disappear +lamp +performs +app +##gia +##oa +rabbit +digging +incidents +entries +##cion +dishes +##oi +introducing +##ati +##fied +freshman +slot +jill +tackles +baroque +backs +##iest +lone +sponsor +destiny +altogether +convert +##aro +consensus +shapes +demonstration +basically +feminist +auction +artifacts +##bing +strongest +twitter +halifax +2019 +allmusic +mighty +smallest +precise +alexandra +viola +##los +##ille +manuscripts +##illo +dancers +ari +managers +monuments +blades +barracks +springfield +maiden +consolidated +electron +##end +berry +airing +wheat +nobel +inclusion +blair +payments +geography +bee +cc +eleanor +react +##hurst +afc +manitoba +##yu +su +lineup +fitness +recreational +investments +airborne +disappointment +##dis +edmonton +viewing +##row +renovation +##cast +infant +bankruptcy +roses +aftermath +pavilion +##yer +carpenter +withdrawal +ladder +##hy +discussing +popped +reliable +agreements +rochester +##abad +curves 
+bombers +220 +rao +reverend +decreased +choosing +107 +stiff +consulting +naples +crawford +tracy +ka +ribbon +cops +##lee +crushed +deciding +unified +teenager +accepting +flagship +explorer +poles +sanchez +inspection +revived +skilled +induced +exchanged +flee +locals +tragedy +swallow +loading +hanna +demonstrate +##ela +salvador +flown +contestants +civilization +##ines +wanna +rhodes +fletcher +hector +knocking +considers +##ough +nash +mechanisms +sensed +mentally +walt +unclear +##eus +renovated +madame +##cks +crews +governmental +##hin +undertaken +monkey +##ben +##ato +fatal +armored +copa +caves +governance +grasp +perception +certification +froze +damp +tugged +wyoming +##rg +##ero +newman +##lor +nerves +curiosity +graph +115 +##ami +withdraw +tunnels +dull +meredith +moss +exhibits +neighbors +communicate +accuracy +explored +raiders +republicans +secular +kat +superman +penny +criticised +##tch +freed +update +conviction +wade +ham +likewise +delegation +gotta +doll +promises +technological +myth +nationality +resolve +convent +##mark +sharon +dig +sip +coordinator +entrepreneur +fold +##dine +capability +councillor +synonym +blown +swan +cursed +1815 +jonas +haired +sofa +canvas +keeper +rivalry +##hart +rapper +speedway +swords +postal +maxwell +estonia +potter +recurring +##nn +##ave +errors +##oni +cognitive +1834 +##² +claws +nadu +roberto +bce +wrestler +ellie +##ations +infinite +ink +##tia +presumably +finite +staircase +108 +noel +patricia +nacional +##cation +chill +eternal +tu +preventing +prussia +fossil +limbs +##logist +ernst +frog +perez +rene +##ace +pizza +prussian +##ios +##vy +molecules +regulatory +answering +opinions +sworn +lengths +supposedly +hypothesis +upward +habitats +seating +ancestors +drank +yield +hd +synthesis +researcher +modest +##var +mothers +peered +voluntary +homeland +##the +acclaim +##igan +static +valve +luxembourg +alto +carroll +fe +receptor +norton +ambulance +##tian +johnston +catholics +depicting +jointly +elephant +gloria +mentor +badge +ahmad +distinguish +remarked +councils +precisely +allison +advancing +detection +crowded +##10 +cooperative +ankle +mercedes +dagger +surrendered +pollution +commit +subway +jeffrey +lesson +sculptures +provider +##fication +membrane +timothy +rectangular +fiscal +heating +teammate +basket +particle +anonymous +deployment +##ple +missiles +courthouse +proportion +shoe +sec +##ller +complaints +forbes +blacks +abandon +remind +sizes +overwhelming +autobiography +natalie +##awa +risks +contestant +countryside +babies +scorer +invaded +enclosed +proceed +hurling +disorders +##cu +reflecting +continuously +cruiser +graduates +freeway +investigated +ore +deserved +maid +blocking +phillip +jorge +shakes +dove +mann +variables +lacked +burden +accompanying +que +consistently +organizing +provisional +complained +endless +##rm +tubes +juice +georges +krishna +mick +labels +thriller +##uch +laps +arcade +sage +snail +##table +shannon +fi +laurence +seoul +vacation +presenting +hire +churchill +surprisingly +prohibited +savannah +technically +##oli +170 +##lessly +testimony +suited +speeds +toys +romans +mlb +flowering +measurement +talented +kay +settings +charleston +expectations +shattered +achieving +triumph +ceremonies +portsmouth +lanes +mandatory +loser +stretching +cologne +realizes +seventy +cornell +careers +webb +##ulating +americas +budapest +ava +suspicion +##ison +yo +conrad +##hai +sterling +jessie +rector +##az +1831 +transform +organize +loans +christine +volcanic +warrant +slender 
+summers +subfamily +newer +danced +dynamics +rhine +proceeds +heinrich +gastropod +commands +sings +facilitate +easter +ra +positioned +responses +expense +fruits +yanked +imported +25th +velvet +vic +primitive +tribune +baldwin +neighbourhood +donna +rip +hay +pr +##uro +1814 +espn +welcomed +##aria +qualifier +glare +highland +timing +##cted +shells +eased +geometry +louder +exciting +slovakia +##sion +##iz +##lot +savings +prairie +##ques +marching +rafael +tonnes +##lled +curtain +preceding +shy +heal +greene +worthy +##pot +detachment +bury +sherman +##eck +reinforced +seeks +bottles +contracted +duchess +outfit +walsh +##sc +mickey +##ase +geoffrey +archer +squeeze +dawson +eliminate +invention +##enberg +neal +##eth +stance +dealer +coral +maple +retire +polo +simplified +##ht +1833 +hid +watts +backwards +jules +##oke +genesis +mt +frames +rebounds +burma +woodland +moist +santos +whispers +drained +subspecies +##aa +streaming +ulster +burnt +correspondence +maternal +gerard +denis +stealing +##load +genius +duchy +##oria +inaugurated +momentum +suits +placement +sovereign +clause +thames +##hara +confederation +reservation +sketch +yankees +lets +rotten +charm +hal +verses +ultra +commercially +dot +salon +citation +adopt +winnipeg +mist +allocated +cairo +##boy +jenkins +interference +objectives +##wind +1820 +portfolio +armoured +sectors +##eh +initiatives +##world +integrity +exercises +robe +tap +ab +gazed +##tones +distracted +rulers +111 +favorable +jerome +tended +cart +factories +##eri +diplomat +valued +gravel +charitable +##try +calvin +exploring +chang +shepherd +terrace +pdf +pupil +##ural +reflects +ups +##rch +governors +shelf +depths +##nberg +trailed +crest +tackle +##nian +##ats +hatred +##kai +clare +makers +ethiopia +longtime +detected +embedded +lacking +slapped +rely +thomson +anticipation +iso +morton +successive +agnes +screenwriter +straightened +philippe +playwright +haunted +licence +iris +intentions +sutton +112 +logical +correctly +##weight +branded +licked +tipped +silva +ricky +narrator +requests +##ents +greeted +supernatural +cow +##wald +lung +refusing +employer +strait +gaelic +liner +##piece +zoe +sabha +##mba +driveway +harvest +prints +bates +reluctantly +threshold +algebra +ira +wherever +coupled +240 +assumption +picks +##air +designers +raids +gentlemen +##ean +roller +blowing +leipzig +locks +screw +dressing +strand +##lings +scar +dwarf +depicts +##nu +nods +##mine +differ +boris +##eur +yuan +flip +##gie +mob +invested +questioning +applying +##ture +shout +##sel +gameplay +blamed +illustrations +bothered +weakness +rehabilitation +##of +##zes +envelope +rumors +miners +leicester +subtle +kerry +##ico +ferguson +##fu +premiership +ne +##cat +bengali +prof +catches +remnants +dana +##rily +shouting +presidents +baltic +ought +ghosts +dances +sailors +shirley +fancy +dominic +##bie +madonna +##rick +bark +buttons +gymnasium +ashes +liver +toby +oath +providence +doyle +evangelical +nixon +cement +carnegie +embarked +hatch +surroundings +guarantee +needing +pirate +essence +##bee +filter +crane +hammond +projected +immune +percy +twelfth +##ult +regent +doctoral +damon +mikhail +##ichi +lu +critically +elect +realised +abortion +acute +screening +mythology +steadily +##fc +frown +nottingham +kirk +wa +minneapolis +##rra +module +algeria +mc +nautical +encounters +surprising +statues +availability +shirts +pie +alma +brows +munster +mack +soup +crater +tornado +sanskrit +cedar +explosive +bordered +dixon +planets +stamp +exam +happily 
+##bble +carriers +kidnapped +##vis +accommodation +emigrated +##met +knockout +correspondent +violation +profits +peaks +lang +specimen +agenda +ancestry +pottery +spelling +equations +obtaining +ki +linking +1825 +debris +asylum +##20 +buddhism +teddy +##ants +gazette +##nger +##sse +dental +eligibility +utc +fathers +averaged +zimbabwe +francesco +coloured +hissed +translator +lynch +mandate +humanities +mackenzie +uniforms +lin +##iana +##gio +asset +mhz +fitting +samantha +genera +wei +rim +beloved +shark +riot +entities +expressions +indo +carmen +slipping +owing +abbot +neighbor +sidney +##av +rats +recommendations +encouraging +squadrons +anticipated +commanders +conquered +##oto +donations +diagnosed +##mond +divide +##iva +guessed +decoration +vernon +auditorium +revelation +conversations +##kers +##power +herzegovina +dash +alike +protested +lateral +herman +accredited +mg +##gent +freeman +mel +fiji +crow +crimson +##rine +livestock +##pped +humanitarian +bored +oz +whip +##lene +##ali +legitimate +alter +grinning +spelled +anxious +oriental +wesley +##nin +##hole +carnival +controller +detect +##ssa +bowed +educator +kosovo +macedonia +##sin +occupy +mastering +stephanie +janeiro +para +unaware +nurses +noon +135 +cam +hopefully +ranger +combine +sociology +polar +rica +##eer +neill +##sman +holocaust +##ip +doubled +lust +1828 +109 +decent +cooling +unveiled +##card +1829 +nsw +homer +chapman +meyer +##gin +dive +mae +reagan +expertise +##gled +darwin +brooke +sided +prosecution +investigating +comprised +petroleum +genres +reluctant +differently +trilogy +johns +vegetables +corpse +highlighted +lounge +pension +unsuccessfully +elegant +aided +ivory +beatles +amelia +cain +dubai +sunny +immigrant +babe +click +##nder +underwater +pepper +combining +mumbled +atlas +horns +accessed +ballad +physicians +homeless +gestured +rpm +freak +louisville +corporations +patriots +prizes +rational +warn +modes +decorative +overnight +din +troubled +phantom +##ort +monarch +sheer +##dorf +generals +guidelines +organs +addresses +##zon +enhance +curling +parishes +cord +##kie +linux +caesar +deutsche +bavaria +##bia +coleman +cyclone +##eria +bacon +petty +##yama +##old +hampton +diagnosis +1824 +throws +complexity +rita +disputed +##₃ +pablo +##sch +marketed +trafficking +##ulus +examine +plague +formats +##oh +vault +faithful +##bourne +webster +##ox +highlights +##ient +##ann +phones +vacuum +sandwich +modeling +##gated +bolivia +clergy +qualities +isabel +##nas +##ars +wears +screams +reunited +annoyed +bra +##ancy +##rate +differential +transmitter +tattoo +container +poker +##och +excessive +resides +cowboys +##tum +augustus +trash +providers +statute +retreated +balcony +reversed +void +storey +preceded +masses +leap +laughs +neighborhoods +wards +schemes +falcon +santo +battlefield +pad +ronnie +thread +lesbian +venus +##dian +beg +sandstone +daylight +punched +gwen +analog +stroked +wwe +acceptable +measurements +dec +toxic +##kel +adequate +surgical +economist +parameters +varsity +##sberg +quantity +ella +##chy +##rton +countess +generating +precision +diamonds +expressway +ga +##ı +1821 +uruguay +talents +galleries +expenses +scanned +colleague +outlets +ryder +lucien +##ila +paramount +##bon +syracuse +dim +fangs +gown +sweep +##sie +toyota +missionaries +websites +##nsis +sentences +adviser +val +trademark +spells +##plane +patience +starter +slim +##borg +toe +incredibly +shoots +elliot +nobility +##wyn +cowboy +endorsed +gardner +tendency +persuaded +organisms +emissions 
+kazakhstan +amused +boring +chips +themed +##hand +llc +constantinople +chasing +systematic +guatemala +borrowed +erin +carey +##hard +highlands +struggles +1810 +##ifying +##ced +wong +exceptions +develops +enlarged +kindergarten +castro +##ern +##rina +leigh +zombie +juvenile +##most +consul +##nar +sailor +hyde +clarence +intensive +pinned +nasty +useless +jung +clayton +stuffed +exceptional +ix +apostolic +230 +transactions +##dge +exempt +swinging +cove +religions +##ash +shields +dairy +bypass +190 +pursuing +bug +joyce +bombay +chassis +southampton +chat +interact +redesignated +##pen +nascar +pray +salmon +rigid +regained +malaysian +grim +publicity +constituted +capturing +toilet +delegate +purely +tray +drift +loosely +striker +weakened +trinidad +mitch +itv +defines +transmitted +ming +scarlet +nodding +fitzgerald +fu +narrowly +sp +tooth +standings +virtue +##₁ +##wara +##cting +chateau +gloves +lid +##nel +hurting +conservatory +##pel +sinclair +reopened +sympathy +nigerian +strode +advocated +optional +chronic +discharge +##rc +suck +compatible +laurel +stella +shi +fails +wage +dodge +128 +informal +sorts +levi +buddha +villagers +##aka +chronicles +heavier +summoned +gateway +3000 +eleventh +jewelry +translations +accordingly +seas +##ency +fiber +pyramid +cubic +dragging +##ista +caring +##ops +android +contacted +lunar +##dt +kai +lisbon +patted +1826 +sacramento +theft +madagascar +subtropical +disputes +ta +holidays +piper +willow +mare +cane +itunes +newfoundland +benny +companions +dong +raj +observe +roar +charming +plaque +tibetan +fossils +enacted +manning +bubble +tina +tanzania +##eda +##hir +funk +swamp +deputies +cloak +ufc +scenario +par +scratch +metals +anthem +guru +engaging +specially +##boat +dialects +nineteen +cecil +duet +disability +messenger +unofficial +##lies +defunct +eds +moonlight +drainage +surname +puzzle +honda +switching +conservatives +mammals +knox +broadcaster +sidewalk +cope +##ried +benson +princes +peterson +##sal +bedford +sharks +eli +wreck +alberto +gasp +archaeology +lgbt +teaches +securities +madness +compromise +waving +coordination +davidson +visions +leased +possibilities +eighty +jun +fernandez +enthusiasm +assassin +sponsorship +reviewer +kingdoms +estonian +laboratories +##fy +##nal +applies +verb +celebrations +##zzo +rowing +lightweight +sadness +submit +mvp +balanced +dude +##vas +explicitly +metric +magnificent +mound +brett +mohammad +mistakes +irregular +##hing +##ass +sanders +betrayed +shipped +surge +##enburg +reporters +termed +georg +pity +verbal +bulls +abbreviated +enabling +appealed +##are +##atic +sicily +sting +heel +sweetheart +bart +spacecraft +brutal +monarchy +##tter +aberdeen +cameo +diane +##ub +survivor +clyde +##aries +complaint +##makers +clarinet +delicious +chilean +karnataka +coordinates +1818 +panties +##rst +pretending +ar +dramatically +kiev +bella +tends +distances +113 +catalog +launching +instances +telecommunications +portable +lindsay +vatican +##eim +angles +aliens +marker +stint +screens +bolton +##rne +judy +wool +benedict +plasma +europa +spark +imaging +filmmaker +swiftly +##een +contributor +##nor +opted +stamps +apologize +financing +butter +gideon +sophisticated +alignment +avery +chemicals +yearly +speculation +prominence +professionally +##ils +immortal +institutional +inception +wrists +identifying +tribunal +derives +gains +##wo +papal +preference +linguistic +vince +operative +brewery +##ont +unemployment +boyd +##ured +##outs +albeit +prophet +1813 +bi +##rr +##face +##rad 
+quarterly +asteroid +cleaned +radius +temper +##llen +telugu +jerk +viscount +menu +##ote +glimpse +##aya +yacht +hawaiian +baden +##rl +laptop +readily +##gu +monetary +offshore +scots +watches +##yang +##arian +upgrade +needle +xbox +lea +encyclopedia +flank +fingertips +##pus +delight +teachings +confirm +roth +beaches +midway +winters +##iah +teasing +daytime +beverly +gambling +bonnie +##backs +regulated +clement +hermann +tricks +knot +##shing +##uring +##vre +detached +ecological +owed +specialty +byron +inventor +bats +stays +screened +unesco +midland +trim +affection +##ander +##rry +jess +thoroughly +feedback +##uma +chennai +strained +heartbeat +wrapping +overtime +pleaded +##sworth +mon +leisure +oclc +##tate +##ele +feathers +angelo +thirds +nuts +surveys +clever +gill +commentator +##dos +darren +rides +gibraltar +##nc +##mu +dissolution +dedication +shin +meals +saddle +elvis +reds +chaired +taller +appreciation +functioning +niece +favored +advocacy +robbie +criminals +suffolk +yugoslav +passport +constable +congressman +hastings +vera +##rov +consecrated +sparks +ecclesiastical +confined +##ovich +muller +floyd +nora +1822 +paved +1827 +cumberland +ned +saga +spiral +##flow +appreciated +yi +collaborative +treating +similarities +feminine +finishes +##ib +jade +import +##nse +##hot +champagne +mice +securing +celebrities +helsinki +attributes +##gos +cousins +phases +ache +lucia +gandhi +submission +vicar +spear +shine +tasmania +biting +detention +constitute +tighter +seasonal +##gus +terrestrial +matthews +##oka +effectiveness +parody +philharmonic +##onic +1816 +strangers +encoded +consortium +guaranteed +regards +shifts +tortured +collision +supervisor +inform +broader +insight +theaters +armour +emeritus +blink +incorporates +mapping +##50 +##ein +handball +flexible +##nta +substantially +generous +thief +##own +carr +loses +1793 +prose +ucla +romeo +generic +metallic +realization +damages +mk +commissioners +zach +default +##ther +helicopters +lengthy +stems +spa +partnered +spectators +rogue +indication +penalties +teresa +1801 +sen +##tric +dalton +##wich +irving +photographic +##vey +dell +deaf +peters +excluded +unsure +##vable +patterson +crawled +##zio +resided +whipped +latvia +slower +ecole +pipes +employers +maharashtra +comparable +va +textile +pageant +##gel +alphabet +binary +irrigation +chartered +choked +antoine +offs +waking +supplement +##wen +quantities +demolition +regain +locate +urdu +folks +alt +114 +##mc +scary +andreas +whites +##ava +classrooms +mw +aesthetic +publishes +valleys +guides +cubs +johannes +bryant +conventions +affecting +##itt +drain +awesome +isolation +prosecutor +ambitious +apology +captive +downs +atmospheric +lorenzo +aisle +beef +foul +##onia +kidding +composite +disturbed +illusion +natives +##ffer +emi +rockets +riverside +wartime +painters +adolf +melted +##ail +uncertainty +simulation +hawks +progressed +meantime +builder +spray +breach +unhappy +regina +russians +##urg +determining +##tation +tram +1806 +##quin +aging +##12 +1823 +garion +rented +mister +diaz +terminated +clip +1817 +depend +nervously +disco +owe +defenders +shiva +notorious +disbelief +shiny +worcester +##gation +##yr +trailing +undertook +islander +belarus +limitations +watershed +fuller +overlooking +utilized +raphael +1819 +synthetic +breakdown +klein +##nate +moaned +memoir +lamb +practicing +##erly +cellular +arrows +exotic +##graphy +witches +117 +charted +rey +hut +hierarchy +subdivision +freshwater +giuseppe +aloud +reyes +qatar +marty 
+sideways +utterly +sexually +jude +prayers +mccarthy +softball +blend +damien +##gging +##metric +wholly +erupted +lebanese +negro +revenues +tasted +comparative +teamed +transaction +labeled +maori +sovereignty +parkway +trauma +gran +malay +121 +advancement +descendant +2020 +buzz +salvation +inventory +symbolic +##making +antarctica +mps +##gas +##bro +mohammed +myanmar +holt +submarines +tones +##lman +locker +patriarch +bangkok +emerson +remarks +predators +kin +afghan +confession +norwich +rental +emerge +advantages +##zel +rca +##hold +shortened +storms +aidan +##matic +autonomy +compliance +##quet +dudley +atp +##osis +1803 +motto +documentation +summary +professors +spectacular +christina +archdiocese +flashing +innocence +remake +##dell +psychic +reef +scare +employ +rs +sticks +meg +gus +leans +##ude +accompany +bergen +tomas +##iko +doom +wages +pools +##nch +##bes +breasts +scholarly +alison +outline +brittany +breakthrough +willis +realistic +##cut +##boro +competitor +##stan +pike +picnic +icon +designing +commercials +washing +villain +skiing +micro +costumes +auburn +halted +executives +##hat +logistics +cycles +vowel +applicable +barrett +exclaimed +eurovision +eternity +ramon +##umi +##lls +modifications +sweeping +disgust +##uck +torch +aviv +ensuring +rude +dusty +sonic +donovan +outskirts +cu +pathway +##band +##gun +##lines +disciplines +acids +cadet +paired +##40 +sketches +##sive +marriages +##⁺ +folding +peers +slovak +implies +admired +##beck +1880s +leopold +instinct +attained +weston +megan +horace +##ination +dorsal +ingredients +evolutionary +##its +complications +deity +lethal +brushing +levy +deserted +institutes +posthumously +delivering +telescope +coronation +motivated +rapids +luc +flicked +pays +volcano +tanner +weighed +##nica +crowds +frankie +gifted +addressing +granddaughter +winding +##rna +constantine +gomez +##front +landscapes +rudolf +anthropology +slate +werewolf +##lio +astronomy +circa +rouge +dreaming +sack +knelt +drowned +naomi +prolific +tracked +freezing +herb +##dium +agony +randall +twisting +wendy +deposit +touches +vein +wheeler +##bbled +##bor +batted +retaining +tire +presently +compare +specification +daemon +nigel +##grave +merry +recommendation +czechoslovakia +sandra +ng +roma +##sts +lambert +inheritance +sheikh +winchester +cries +examining +##yle +comeback +cuisine +nave +##iv +ko +retrieve +tomatoes +barker +polished +defining +irene +lantern +personalities +begging +tract +swore +1809 +175 +##gic +omaha +brotherhood +##rley +haiti +##ots +exeter +##ete +##zia +steele +dumb +pearson +210 +surveyed +elisabeth +trends +##ef +fritz +##rf +premium +bugs +fraction +calmly +viking +##birds +tug +inserted +unusually +##ield +confronted +distress +crashing +brent +turks +resign +##olo +cambodia +gabe +sauce +##kal +evelyn +116 +extant +clusters +quarry +teenagers +luna +##lers +##ister +affiliation +drill +##ashi +panthers +scenic +libya +anita +strengthen +inscriptions +##cated +lace +sued +judith +riots +##uted +mint +##eta +preparations +midst +dub +challenger +##vich +mock +cf +displaced +wicket +breaths +enables +schmidt +analyst +##lum +ag +highlight +automotive +axe +josef +newark +sufficiently +resembles +50th +##pal +flushed +mum +traits +##ante +commodore +incomplete +warming +titular +ceremonial +ethical +118 +celebrating +eighteenth +cao +lima +medalist +mobility +strips +snakes +##city +miniature +zagreb +barton +escapes +umbrella +automated +doubted +differs +cooled +georgetown +dresden +cooked +fade +wyatt +rna 
+jacobs +carlton +abundant +stereo +boost +madras +inning +##hia +spur +ip +malayalam +begged +osaka +groan +escaping +charging +dose +vista +##aj +bud +papa +communists +advocates +edged +tri +##cent +resemble +peaking +necklace +fried +montenegro +saxony +goose +glances +stuttgart +curator +recruit +grocery +sympathetic +##tting +##fort +127 +lotus +randolph +ancestor +##rand +succeeding +jupiter +1798 +macedonian +##heads +hiking +1808 +handing +fischer +##itive +garbage +node +##pies +prone +singular +papua +inclined +attractions +italia +pouring +motioned +grandma +garnered +jacksonville +corp +ego +ringing +aluminum +##hausen +ordering +##foot +drawer +traders +synagogue +##play +##kawa +resistant +wandering +fragile +fiona +teased +var +hardcore +soaked +jubilee +decisive +exposition +mercer +poster +valencia +hale +kuwait +1811 +##ises +##wr +##eed +tavern +gamma +122 +johan +##uer +airways +amino +gil +##ury +vocational +domains +torres +##sp +generator +folklore +outcomes +##keeper +canberra +shooter +fl +beams +confrontation +##lling +##gram +feb +aligned +forestry +pipeline +jax +motorway +conception +decay +##tos +coffin +##cott +stalin +1805 +escorted +minded +##nam +sitcom +purchasing +twilight +veronica +additions +passive +tensions +straw +123 +frequencies +1804 +refugee +cultivation +##iate +christie +clary +bulletin +crept +disposal +##rich +##zong +processor +crescent +##rol +bmw +emphasized +whale +nazis +aurora +##eng +dwelling +hauled +sponsors +toledo +mega +ideology +theatres +tessa +cerambycidae +saves +turtle +cone +suspects +kara +rusty +yelling +greeks +mozart +shades +cocked +participant +##tro +shire +spit +freeze +necessity +##cos +inmates +nielsen +councillors +loaned +uncommon +omar +peasants +botanical +offspring +daniels +formations +jokes +1794 +pioneers +sigma +licensing +##sus +wheelchair +polite +1807 +liquor +pratt +trustee +##uta +forewings +balloon +##zz +kilometre +camping +explicit +casually +shawn +foolish +teammates +nm +hassan +carrie +judged +satisfy +vanessa +knives +selective +cnn +flowed +##lice +eclipse +stressed +eliza +mathematician +cease +cultivated +##roy +commissions +browns +##ania +destroyers +sheridan +meadow +##rius +minerals +##cial +downstream +clash +gram +memoirs +ventures +baha +seymour +archie +midlands +edith +fare +flynn +invite +canceled +tiles +stabbed +boulder +incorporate +amended +camden +facial +mollusk +unreleased +descriptions +yoga +grabs +550 +raises +ramp +shiver +##rose +coined +pioneering +tunes +qing +warwick +tops +119 +melanie +giles +##rous +wandered +##inal +annexed +nov +30th +unnamed +##ished +organizational +airplane +normandy +stoke +whistle +blessing +violations +chased +holders +shotgun +##ctic +outlet +reactor +##vik +tires +tearing +shores +fortified +mascot +constituencies +nc +columnist +productive +tibet +##rta +lineage +hooked +oct +tapes +judging +cody +##gger +hansen +kashmir +triggered +##eva +solved +cliffs +##tree +resisted +anatomy +protesters +transparent +implied +##iga +injection +mattress +excluding +##mbo +defenses +helpless +devotion +##elli +growl +liberals +weber +phenomena +atoms +plug +##iff +mortality +apprentice +howe +convincing +aaa +swimmer +barber +leone +promptly +sodium +def +nowadays +arise +##oning +gloucester +corrected +dignity +norm +erie +##ders +elders +evacuated +sylvia +compression +##yar +hartford +pose +backpack +reasoning +accepts +24th +wipe +millimetres +marcel +##oda +dodgers +albion +1790 +overwhelmed +aerospace +oaks +1795 +showcase +acknowledge 
+recovering +nolan +ashe +hurts +geology +fashioned +disappearance +farewell +swollen +shrug +marquis +wimbledon +124 +rue +1792 +commemorate +reduces +experiencing +inevitable +calcutta +intel +##court +murderer +sticking +fisheries +imagery +bloom +280 +brake +##inus +gustav +hesitation +memorable +po +viral +beans +accidents +tunisia +antenna +spilled +consort +treatments +aye +perimeter +##gard +donation +hostage +migrated +banker +addiction +apex +lil +trout +##ously +conscience +##nova +rams +sands +genome +passionate +troubles +##lets +##set +amid +##ibility +##ret +higgins +exceed +vikings +##vie +payne +##zan +muscular +##ste +defendant +sucking +##wal +ibrahim +fuselage +claudia +vfl +europeans +snails +interval +##garh +preparatory +statewide +tasked +lacrosse +viktor +##lation +angola +##hra +flint +implications +employs +teens +patrons +stall +weekends +barriers +scrambled +nucleus +tehran +jenna +parsons +lifelong +robots +displacement +5000 +##bles +precipitation +##gt +knuckles +clutched +1802 +marrying +ecology +marx +accusations +declare +scars +kolkata +mat +meadows +bermuda +skeleton +finalists +vintage +crawl +coordinate +affects +subjected +orchestral +mistaken +##tc +mirrors +dipped +relied +260 +arches +candle +##nick +incorporating +wildly +fond +basilica +owl +fringe +rituals +whispering +stirred +feud +tertiary +slick +goat +honorable +whereby +skip +ricardo +stripes +parachute +adjoining +submerged +synthesizer +##gren +intend +positively +ninety +phi +beaver +partition +fellows +alexis +prohibition +carlisle +bizarre +fraternity +##bre +doubts +icy +cbc +aquatic +sneak +sonny +combines +airports +crude +supervised +spatial +merge +alfonso +##bic +corrupt +scan +undergo +##ams +disabilities +colombian +comparing +dolphins +perkins +##lish +reprinted +unanimous +bounced +hairs +underworld +midwest +semester +bucket +paperback +miniseries +coventry +demise +##leigh +demonstrations +sensor +rotating +yan +##hler +arrange +soils +##idge +hyderabad +labs +##dr +brakes +grandchildren +##nde +negotiated +rover +ferrari +continuation +directorate +augusta +stevenson +counterpart +gore +##rda +nursery +rican +ave +collectively +broadly +pastoral +repertoire +asserted +discovering +nordic +styled +fiba +cunningham +harley +middlesex +survives +tumor +tempo +zack +aiming +lok +urgent +##rade +##nto +devils +##ement +contractor +turin +##wl +##ool +bliss +repaired +simmons +moan +astronomical +cr +negotiate +lyric +1890s +lara +bred +clad +angus +pbs +##ience +engineered +posed +##lk +hernandez +possessions +elbows +psychiatric +strokes +confluence +electorate +lifts +campuses +lava +alps +##ep +##ution +##date +physicist +woody +##page +##ographic +##itis +juliet +reformation +sparhawk +320 +complement +suppressed +jewel +##½ +floated +##kas +continuity +sadly +##ische +inability +melting +scanning +paula +flour +judaism +safer +vague +##lm +solving +curb +##stown +financially +gable +bees +expired +miserable +cassidy +dominion +1789 +cupped +145 +robbery +facto +amos +warden +resume +tallest +marvin +ing +pounded +usd +declaring +gasoline +##aux +darkened +270 +650 +sophomore +##mere +erection +gossip +televised +risen +dial +##eu +pillars +##link +passages +profound +##tina +arabian +ashton +silicon +nail +##ead +##lated +##wer +##hardt +fleming +firearms +ducked +circuits +blows +waterloo +titans +##lina +atom +fireplace +cheshire +financed +activation +algorithms +##zzi +constituent +catcher +cherokee +partnerships +sexuality +platoon +tragic +vivian +guarded +whiskey 
+[... WordPiece vocabulary entries, one token per line in the source diff (subword pieces prefixed with "##"); this segment runs roughly from "meditation" through "johanna" and the individual tokens are elided here ...]
+##imate +gel +suspiciously +1767 +sobs +##dington +backbone +hayward +garry +##folding +##nesia +maxi +##oof +##ppe +ellison +galileo +##stand +crimea +frenzy +amour +bumper +matrices +natalia +baking +garth +palestinians +##grove +smack +conveyed +ensembles +gardening +##manship +##rup +##stituting +1640 +harvesting +topography +jing +shifters +dormitory +##carriage +##lston +ist +skulls +##stadt +dolores +jewellery +sarawak +##wai +##zier +fences +christy +confinement +tumbling +credibility +fir +stench +##bria +##plication +##nged +##sam +virtues +##belt +marjorie +pba +##eem +##made +celebrates +schooner +agitated +barley +fulfilling +anthropologist +##pro +restrict +novi +regulating +##nent +padres +##rani +##hesive +loyola +tabitha +milky +olson +proprietor +crambidae +guarantees +intercollegiate +ljubljana +hilda +##sko +ignorant +hooded +##lts +sardinia +##lidae +##vation +frontman +privileged +witchcraft +##gp +jammed +laude +poking +##than +bracket +amazement +yunnan +##erus +maharaja +linnaeus +264 +commissioning +milano +peacefully +##logies +akira +rani +regulator +##36 +grasses +##rance +luzon +crows +compiler +gretchen +seaman +edouard +tab +buccaneers +ellington +hamlets +whig +socialists +##anto +directorial +easton +mythological +##kr +##vary +rhineland +semantic +taut +dune +inventions +succeeds +##iter +replication +branched +##pired +jul +prosecuted +kangaroo +penetrated +##avian +middlesbrough +doses +bleak +madam +predatory +relentless +##vili +reluctance +##vir +hailey +crore +silvery +1759 +monstrous +swimmers +transmissions +hawthorn +informing +##eral +toilets +caracas +crouch +kb +##sett +295 +cartel +hadley +##aling +alexia +yvonne +##biology +cinderella +eton +superb +blizzard +stabbing +industrialist +maximus +##gm +##orus +groves +maud +clade +oversized +comedic +##bella +rosen +nomadic +fulham +montane +beverages +galaxies +redundant +swarm +##rot +##folia +##llis +buckinghamshire +fen +bearings +bahadur +##rom +gilles +phased +dynamite +faber +benoit +vip +##ount +##wd +booking +fractured +tailored +anya +spices +westwood +cairns +auditions +inflammation +steamed +##rocity +##acion +##urne +skyla +thereof +watford +torment +archdeacon +transforms +lulu +demeanor +fucked +serge +##sor +mckenna +minas +entertainer +##icide +caress +originate +residue +##sty +1740 +##ilised +##org +beech +##wana +subsidies +##ghton +emptied +gladstone +ru +firefighters +voodoo +##rcle +het +nightingale +tamara +edmond +ingredient +weaknesses +silhouette +285 +compatibility +withdrawing +hampson +##mona +anguish +giggling +##mber +bookstore +##jiang +southernmost +tilting +##vance +bai +economical +rf +briefcase +dreadful +hinted +projections +shattering +totaling +##rogate +analogue +indicted +periodical +fullback +##dman +haynes +##tenberg +##ffs +##ishment +1745 +thirst +stumble +penang +vigorous +##ddling +##kor +##lium +octave +##ove +##enstein +##inen +##ones +siberian +##uti +cbn +repeal +swaying +##vington +khalid +tanaka +unicorn +otago +plastered +lobe +riddle +##rella +perch +##ishing +croydon +filtered +graeme +tripoli +##ossa +crocodile +##chers +sufi +mined +##tung +inferno +lsu +##phi +swelled +utilizes +£2 +cale +periodicals +styx +hike +informally +coop +lund +##tidae +ala +hen +qui +transformations +disposed +sheath +chickens +##cade +fitzroy +sas +silesia +unacceptable +odisha +1650 +sabrina +pe +spokane +ratios +athena +massage +shen +dilemma +##drum +##riz +##hul +corona +doubtful +niall +##pha +##bino +fines +cite +acknowledging +bangor +ballard 
+bathurst +##resh +huron +mustered +alzheimer +garments +kinase +tyre +warship +##cp +flashback +pulmonary +braun +cheat +kamal +cyclists +constructions +grenades +ndp +traveller +excuses +stomped +signalling +trimmed +futsal +mosques +relevance +##wine +wta +##23 +##vah +##lter +hoc +##riding +optimistic +##´s +deco +sim +interacting +rejecting +moniker +waterways +##ieri +##oku +mayors +gdansk +outnumbered +pearls +##ended +##hampton +fairs +totals +dominating +262 +notions +stairway +compiling +pursed +commodities +grease +yeast +##jong +carthage +griffiths +residual +amc +contraction +laird +sapphire +##marine +##ivated +amalgamation +dissolve +inclination +lyle +packaged +altitudes +suez +canons +graded +lurched +narrowing +boasts +guise +wed +enrico +##ovsky +rower +scarred +bree +cub +iberian +protagonists +bargaining +proposing +trainers +voyages +vans +fishes +##aea +##ivist +##verance +encryption +artworks +kazan +sabre +cleopatra +hepburn +rotting +supremacy +mecklenburg +##brate +burrows +hazards +outgoing +flair +organizes +##ctions +scorpion +##usions +boo +234 +chevalier +dunedin +slapping +##34 +ineligible +pensions +##38 +##omic +manufactures +emails +bismarck +238 +weakening +blackish +ding +mcgee +quo +##rling +northernmost +xx +manpower +greed +sampson +clicking +##ange +##horpe +##inations +##roving +torre +##eptive +##moral +symbolism +38th +asshole +meritorious +outfits +splashed +biographies +sprung +astros +##tale +302 +737 +filly +raoul +nw +tokugawa +linden +clubhouse +##apa +tracts +romano +##pio +putin +tags +##note +chained +dickson +gunshot +moe +gunn +rashid +##tails +zipper +##bas +##nea +contrasted +##ply +##udes +plum +pharaoh +##pile +aw +comedies +ingrid +sandwiches +subdivisions +1100 +mariana +nokia +kamen +hz +delaney +veto +herring +##words +possessive +outlines +##roup +siemens +stairwell +rc +gallantry +messiah +palais +yells +233 +zeppelin +##dm +bolivar +##cede +smackdown +mckinley +##mora +##yt +muted +geologic +finely +unitary +avatar +hamas +maynard +rees +bog +contrasting +##rut +liv +chico +disposition +pixel +##erate +becca +dmitry +yeshiva +narratives +##lva +##ulton +mercenary +sharpe +tempered +navigate +stealth +amassed +keynes +##lini +untouched +##rrie +havoc +lithium +##fighting +abyss +graf +southward +wolverine +balloons +implements +ngos +transitions +##icum +ambushed +concacaf +dormant +economists +##dim +costing +csi +rana +universite +boulders +verity +##llon +collin +mellon +misses +cypress +fluorescent +lifeless +spence +##ulla +crewe +shepard +pak +revelations +##م +jolly +gibbons +paw +##dro +##quel +freeing +##test +shack +fries +palatine +##51 +##hiko +accompaniment +cruising +recycled +##aver +erwin +sorting +synthesizers +dyke +realities +sg +strides +enslaved +wetland +##ghan +competence +gunpowder +grassy +maroon +reactors +objection +##oms +carlson +gearbox +macintosh +radios +shelton +##sho +clergyman +prakash +254 +mongols +trophies +oricon +228 +stimuli +twenty20 +cantonese +cortes +mirrored +##saurus +bhp +cristina +melancholy +##lating +enjoyable +nuevo +##wny +downfall +schumacher +##ind +banging +lausanne +rumbled +paramilitary +reflex +ax +amplitude +migratory +##gall +##ups +midi +barnard +lastly +sherry +##hp +##nall +keystone +##kra +carleton +slippery +##53 +coloring +foe +socket +otter +##rgos +mats +##tose +consultants +bafta +bison +topping +##km +490 +primal +abandonment +transplant +atoll +hideous +mort +pained +reproduced +tae +howling +##turn +unlawful +billionaire +hotter +poised +lansing 
+##chang +dinamo +retro +messing +nfc +domesday +##mina +blitz +timed +##athing +##kley +ascending +gesturing +##izations +signaled +tis +chinatown +mermaid +savanna +jameson +##aint +catalina +##pet +##hers +cochrane +cy +chatting +##kus +alerted +computation +mused +noelle +majestic +mohawk +campo +octagonal +##sant +##hend +241 +aspiring +##mart +comprehend +iona +paralyzed +shimmering +swindon +rhone +##eley +reputed +configurations +pitchfork +agitation +francais +gillian +lipstick +##ilo +outsiders +pontifical +resisting +bitterness +sewer +rockies +##edd +##ucher +misleading +1756 +exiting +galloway +##nging +risked +##heart +246 +commemoration +schultz +##rka +integrating +##rsa +poses +shrieked +##weiler +guineas +gladys +jerking +owls +goldsmith +nightly +penetrating +##unced +lia +##33 +ignited +betsy +##aring +##thorpe +follower +vigorously +##rave +coded +kiran +knit +zoology +tbilisi +##28 +##bered +repository +govt +deciduous +dino +growling +##bba +enhancement +unleashed +chanting +pussy +biochemistry +##eric +kettle +repression +toxicity +nrhp +##arth +##kko +##bush +ernesto +commended +outspoken +242 +mca +parchment +sms +kristen +##aton +bisexual +raked +glamour +navajo +a2 +conditioned +showcased +##hma +spacious +youthful +##esa +usl +appliances +junta +brest +layne +conglomerate +enchanted +chao +loosened +picasso +circulating +inspect +montevideo +##centric +##kti +piazza +spurred +##aith +bari +freedoms +poultry +stamford +lieu +##ect +indigo +sarcastic +bahia +stump +attach +dvds +frankenstein +lille +approx +scriptures +pollen +##script +nmi +overseen +##ivism +tides +proponent +newmarket +inherit +milling +##erland +centralized +##rou +distributors +credentials +drawers +abbreviation +##lco +##xon +downing +uncomfortably +ripe +##oes +erase +franchises +##ever +populace +##bery +##khar +decomposition +pleas +##tet +daryl +sabah +##stle +##wide +fearless +genie +lesions +annette +##ogist +oboe +appendix +nair +dripped +petitioned +maclean +mosquito +parrot +rpg +hampered +1648 +operatic +reservoirs +##tham +irrelevant +jolt +summarized +##fp +medallion +##taff +##− +clawed +harlow +narrower +goddard +marcia +bodied +fremont +suarez +altering +tempest +mussolini +porn +##isms +sweetly +oversees +walkers +solitude +grimly +shrines +hk +ich +supervisors +hostess +dietrich +legitimacy +brushes +expressive +##yp +dissipated +##rse +localized +systemic +##nikov +gettysburg +##js +##uaries +dialogues +muttering +251 +housekeeper +sicilian +discouraged +##frey +beamed +kaladin +halftime +kidnap +##amo +##llet +1754 +synonymous +depleted +instituto +insulin +reprised +##opsis +clashed +##ctric +interrupting +radcliffe +insisting +medici +1715 +ejected +playfully +turbulent +##47 +starvation +##rini +shipment +rebellious +petersen +verification +merits +##rified +cakes +##charged +1757 +milford +shortages +spying +fidelity +##aker +emitted +storylines +harvested +seismic +##iform +cheung +kilda +theoretically +barbie +lynx +##rgy +##tius +goblin +mata +poisonous +##nburg +reactive +residues +obedience +##евич +conjecture +##rac +401 +hating +sixties +kicker +moaning +motown +##bha +emancipation +neoclassical +##hering +consoles +ebert +professorship +##tures +sustaining +assaults +obeyed +affluent +incurred +tornadoes +##eber +##zow +emphasizing +highlanders +cheated +helmets +##ctus +internship +terence +bony +executions +legislators +berries +peninsular +tinged +##aco +1689 +amplifier +corvette +ribbons +lavish +pennant +##lander +worthless +##chfield +##forms +mariano 
+pyrenees +expenditures +##icides +chesterfield +mandir +tailor +39th +sergey +nestled +willed +aristocracy +devotees +goodnight +raaf +rumored +weaponry +remy +appropriations +harcourt +burr +riaa +##lence +limitation +unnoticed +guo +soaking +swamps +##tica +collapsing +tatiana +descriptive +brigham +psalm +##chment +maddox +##lization +patti +caliph +##aja +akron +injuring +serra +##ganj +basins +##sari +astonished +launcher +##church +hilary +wilkins +sewing +##sf +stinging +##fia +##ncia +underwood +startup +##ition +compilations +vibrations +embankment +jurist +##nity +bard +juventus +groundwater +kern +palaces +helium +boca +cramped +marissa +soto +##worm +jae +princely +##ggy +faso +bazaar +warmly +##voking +229 +pairing +##lite +##grate +##nets +wien +freaked +ulysses +rebirth +##alia +##rent +mummy +guzman +jimenez +stilled +##nitz +trajectory +tha +woken +archival +professions +##pts +##pta +hilly +shadowy +shrink +##bolt +norwood +glued +migrate +stereotypes +devoid +##pheus +625 +evacuate +horrors +infancy +gotham +knowles +optic +downloaded +sachs +kingsley +parramatta +darryl +mor +##onale +shady +commence +confesses +kan +##meter +##placed +marlborough +roundabout +regents +frigates +io +##imating +gothenburg +revoked +carvings +clockwise +convertible +intruder +##sche +banged +##ogo +vicky +bourgeois +##mony +dupont +footing +##gum +pd +##real +buckle +yun +penthouse +sane +720 +serviced +stakeholders +neumann +bb +##eers +comb +##gam +catchment +pinning +rallies +typing +##elles +forefront +freiburg +sweetie +giacomo +widowed +goodwill +worshipped +aspirations +midday +##vat +fishery +##trick +bournemouth +turk +243 +hearth +ethanol +guadalajara +murmurs +sl +##uge +afforded +scripted +##hta +wah +##jn +coroner +translucent +252 +memorials +puck +progresses +clumsy +##race +315 +candace +recounted +##27 +##slin +##uve +filtering +##mac +howl +strata +heron +leveled +##ays +dubious +##oja +##т +##wheel +citations +exhibiting +##laya +##mics +##pods +turkic +##lberg +injunction +##ennial +##mit +antibodies +##44 +organise +##rigues +cardiovascular +cushion +inverness +##zquez +dia +cocoa +sibling +##tman +##roid +expanse +feasible +tunisian +algiers +##relli +rus +bloomberg +dso +westphalia +bro +tacoma +281 +downloads +##ours +konrad +duran +##hdi +continuum +jett +compares +legislator +secession +##nable +##gues +##zuka +translating +reacher +##gley +##ła +aleppo +##agi +tc +orchards +trapping +linguist +versatile +drumming +postage +calhoun +superiors +##mx +barefoot +leary +##cis +ignacio +alfa +kaplan +##rogen +bratislava +mori +##vot +disturb +haas +313 +cartridges +gilmore +radiated +salford +tunic +hades +##ulsive +archeological +delilah +magistrates +auditioned +brewster +charters +empowerment +blogs +cappella +dynasties +iroquois +whipping +##krishna +raceway +truths +myra +weaken +judah +mcgregor +##horse +mic +refueling +37th +burnley +bosses +markus +premio +query +##gga +dunbar +##economic +darkest +lyndon +sealing +commendation +reappeared +##mun +addicted +ezio +slaughtered +satisfactory +shuffle +##eves +##thic +##uj +fortification +warrington +##otto +resurrected +fargo +mane +##utable +##lei +##space +foreword +ox +##aris +##vern +abrams +hua +##mento +sakura +##alo +uv +sentimental +##skaya +midfield +##eses +sturdy +scrolls +macleod +##kyu +entropy +##lance +mitochondrial +cicero +excelled +thinner +convoys +perceive +##oslav +##urable +systematically +grind +burkina +287 +##tagram +ops +##aman +guantanamo +##cloth +##tite +forcefully +wavy +##jou 
+pointless +##linger +##tze +layton +portico +superficial +clerical +outlaws +##hism +burials +muir +##inn +creditors +hauling +rattle +##leg +calais +monde +archers +reclaimed +dwell +wexford +hellenic +falsely +remorse +##tek +dough +furnishings +##uttered +gabon +neurological +novice +##igraphy +contemplated +pulpit +nightstand +saratoga +##istan +documenting +pulsing +taluk +##firmed +busted +marital +##rien +disagreements +wasps +##yes +hodge +mcdonnell +mimic +fran +pendant +dhabi +musa +##nington +congratulations +argent +darrell +concussion +losers +regrets +thessaloniki +reversal +donaldson +hardwood +thence +achilles +ritter +##eran +demonic +jurgen +prophets +goethe +eki +classmate +buff +##cking +yank +irrational +##inging +perished +seductive +qur +sourced +##crat +##typic +mustard +ravine +barre +horizontally +characterization +phylogenetic +boise +##dit +##runner +##tower +brutally +intercourse +seduce +##bbing +fay +ferris +ogden +amar +nik +unarmed +##inator +evaluating +kyrgyzstan +sweetness +##lford +##oki +mccormick +meiji +notoriety +stimulate +disrupt +figuring +instructional +mcgrath +##zoo +groundbreaking +##lto +flinch +khorasan +agrarian +bengals +mixer +radiating +##sov +ingram +pitchers +nad +tariff +##cript +tata +##codes +##emi +##ungen +appellate +lehigh +##bled +##giri +brawl +duct +texans +##ciation +##ropolis +skipper +speculative +vomit +doctrines +stresses +253 +davy +graders +whitehead +jozef +timely +cumulative +haryana +paints +appropriately +boon +cactus +##ales +##pid +dow +legions +##pit +perceptions +1730 +picturesque +##yse +periphery +rune +wr +##aha +celtics +sentencing +whoa +##erin +confirms +variance +425 +moines +mathews +spade +rave +m1 +fronted +fx +blending +alleging +reared +##gl +237 +##paper +grassroots +eroded +##free +##physical +directs +ordeal +##sław +accelerate +hacker +rooftop +##inia +lev +buys +cebu +devote +##lce +specialising +##ulsion +choreographed +repetition +warehouses +##ryl +paisley +tuscany +analogy +sorcerer +hash +huts +shards +descends +exclude +nix +chaplin +gaga +ito +vane +##drich +causeway +misconduct +limo +orchestrated +glands +jana +##kot +u2 +##mple +##sons +branching +contrasts +scoop +longed +##virus +chattanooga +##75 +syrup +cornerstone +##tized +##mind +##iaceae +careless +precedence +frescoes +##uet +chilled +consult +modelled +snatch +peat +##thermal +caucasian +humane +relaxation +spins +temperance +##lbert +occupations +lambda +hybrids +moons +mp3 +##oese +247 +rolf +societal +yerevan +ness +##ssler +befriended +mechanized +nominate +trough +boasted +cues +seater +##hom +bends +##tangle +conductors +emptiness +##lmer +eurasian +adriatic +tian +##cie +anxiously +lark +propellers +chichester +jock +ev +2a +##holding +credible +recounts +tori +loyalist +abduction +##hoot +##redo +nepali +##mite +ventral +tempting +##ango +##crats +steered +##wice +javelin +dipping +laborers +prentice +looming +titanium +##ː +badges +emir +tensor +##ntation +egyptians +rash +denies +hawthorne +lombard +showers +wehrmacht +dietary +trojan +##reus +welles +executing +horseshoe +lifeboat +##lak +elsa +infirmary +nearing +roberta +boyer +mutter +trillion +joanne +##fine +##oked +sinks +vortex +uruguayan +clasp +sirius +##block +accelerator +prohibit +sunken +byu +chronological +diplomats +ochreous +510 +symmetrical +1644 +maia +##tology +salts +reigns +atrocities +##ия +hess +bared +issn +##vyn +cater +saturated +##cycle +##isse +sable +voyager +dyer +yusuf +##inge +fountains +wolff +##39 +##nni +engraving +rollins 
+atheist +ominous +##ault +herr +chariot +martina +strung +##fell +##farlane +horrific +sahib +gazes +saetan +erased +ptolemy +##olic +flushing +lauderdale +analytic +##ices +530 +navarro +beak +gorilla +herrera +broom +guadalupe +raiding +sykes +311 +bsc +deliveries +1720 +invasions +carmichael +tajikistan +thematic +ecumenical +sentiments +onstage +##rians +##brand +##sume +catastrophic +flanks +molten +##arns +waller +aimee +terminating +##icing +alternately +##oche +nehru +printers +outraged +##eving +empires +template +banners +repetitive +za +##oise +vegetarian +##tell +guiana +opt +cavendish +lucknow +synthesized +##hani +##mada +finalized +##ctable +fictitious +mayoral +unreliable +##enham +embracing +peppers +rbis +##chio +##neo +inhibition +slashed +togo +orderly +embroidered +safari +salty +236 +barron +benito +totaled +##dak +pubs +simulated +caden +devin +tolkien +momma +welding +sesame +##ept +gottingen +hardness +630 +shaman +temeraire +620 +adequately +pediatric +##kit +ck +assertion +radicals +composure +cadence +seafood +beaufort +lazarus +mani +warily +cunning +kurdistan +249 +cantata +##kir +ares +##41 +##clusive +nape +townland +geared +insulted +flutter +boating +violate +draper +dumping +malmo +##hh +##romatic +firearm +alta +bono +obscured +##clave +exceeds +panorama +unbelievable +##train +preschool +##essed +disconnected +installing +rescuing +secretaries +accessibility +##castle +##drive +##ifice +##film +bouts +slug +waterway +mindanao +##buro +##ratic +halves +##ل +calming +liter +maternity +adorable +bragg +electrification +mcc +##dote +roxy +schizophrenia +##body +munoz +kaye +whaling +239 +mil +tingling +tolerant +##ago +unconventional +volcanoes +##finder +deportivo +##llie +robson +kaufman +neuroscience +wai +deportation +masovian +scraping +converse +##bh +hacking +bulge +##oun +administratively +yao +580 +amp +mammoth +booster +claremont +hooper +nomenclature +pursuits +mclaughlin +melinda +##sul +catfish +barclay +substrates +taxa +zee +originals +kimberly +packets +padma +##ality +borrowing +ostensibly +solvent +##bri +##genesis +##mist +lukas +shreveport +veracruz +##ь +##lou +##wives +cheney +tt +anatolia +hobbs +##zyn +cyclic +radiant +alistair +greenish +siena +dat +independents +##bation +conform +pieter +hyper +applicant +bradshaw +spores +telangana +vinci +inexpensive +nuclei +322 +jang +nme +soho +spd +##ign +cradled +receptionist +pow +##43 +##rika +fascism +##ifer +experimenting +##ading +##iec +##region +345 +jocelyn +maris +stair +nocturnal +toro +constabulary +elgin +##kker +msc +##giving +##schen +##rase +doherty +doping +sarcastically +batter +maneuvers +##cano +##apple +##gai +##git +intrinsic +##nst +##stor +1753 +showtime +cafes +gasps +lviv +ushered +##thed +fours +restart +astonishment +transmitting +flyer +shrugs +##sau +intriguing +cones +dictated +mushrooms +medial +##kovsky +##elman +escorting +gaped +##26 +godfather +##door +##sell +djs +recaptured +timetable +vila +1710 +3a +aerodrome +mortals +scientology +##orne +angelina +mag +convection +unpaid +insertion +intermittent +lego +##nated +endeavor +kota +pereira +##lz +304 +bwv +glamorgan +insults +agatha +fey +##cend +fleetwood +mahogany +protruding +steamship +zeta +##arty +mcguire +suspense +##sphere +advising +urges +##wala +hurriedly +meteor +gilded +inline +arroyo +stalker +##oge +excitedly +revered +##cure +earle +introductory +##break +##ilde +mutants +puff +pulses +reinforcement +##haling +curses +lizards +stalk +correlated +##fixed +fallout +macquarie +##unas 
+bearded +denton +heaving +802 +##ocation +winery +assign +dortmund +##lkirk +everest +invariant +charismatic +susie +##elling +bled +lesley +telegram +sumner +bk +##ogen +##к +wilcox +needy +colbert +duval +##iferous +##mbled +allotted +attends +imperative +##hita +replacements +hawker +##inda +insurgency +##zee +##eke +casts +##yla +680 +ives +transitioned +##pack +##powering +authoritative +baylor +flex +cringed +plaintiffs +woodrow +##skie +drastic +ape +aroma +unfolded +commotion +nt +preoccupied +theta +routines +lasers +privatization +wand +domino +ek +clenching +nsa +strategically +showered +bile +handkerchief +pere +storing +christophe +insulting +316 +nakamura +romani +asiatic +magdalena +palma +cruises +stripping +405 +konstantin +soaring +##berman +colloquially +forerunner +havilland +incarcerated +parasites +sincerity +##utus +disks +plank +saigon +##ining +corbin +homo +ornaments +powerhouse +##tlement +chong +fastened +feasibility +idf +morphological +usable +##nish +##zuki +aqueduct +jaguars +keepers +##flies +aleksandr +faust +assigns +ewing +bacterium +hurled +tricky +hungarians +integers +wallis +321 +yamaha +##isha +hushed +oblivion +aviator +evangelist +friars +##eller +monograph +ode +##nary +airplanes +labourers +charms +##nee +1661 +hagen +tnt +rudder +fiesta +transcript +dorothea +ska +inhibitor +maccabi +retorted +raining +encompassed +clauses +menacing +1642 +lineman +##gist +vamps +##ape +##dick +gloom +##rera +dealings +easing +seekers +##nut +##pment +helens +unmanned +##anu +##isson +basics +##amy +##ckman +adjustments +1688 +brutality +horne +##zell +sui +##55 +##mable +aggregator +##thal +rhino +##drick +##vira +counters +zoom +##01 +##rting +mn +montenegrin +packard +##unciation +##♭ +##kki +reclaim +scholastic +thugs +pulsed +##icia +syriac +quan +saddam +banda +kobe +blaming +buddies +dissent +##lusion +##usia +corbett +jaya +delle +erratic +lexie +##hesis +435 +amiga +hermes +##pressing +##leen +chapels +gospels +jamal +##uating +compute +revolving +warp +##sso +##thes +armory +##eras +##gol +antrim +loki +##kow +##asian +##good +##zano +braid +handwriting +subdistrict +funky +pantheon +##iculate +concurrency +estimation +improper +juliana +##his +newcomers +johnstone +staten +communicated +##oco +##alle +sausage +stormy +##stered +##tters +superfamily +##grade +acidic +collateral +tabloid +##oped +##rza +bladder +austen +##ellant +mcgraw +##hay +hannibal +mein +aquino +lucifer +wo +badger +boar +cher +christensen +greenberg +interruption +##kken +jem +244 +mocked +bottoms +cambridgeshire +##lide +sprawling +##bbly +eastwood +ghent +synth +##buck +advisers +##bah +nominally +hapoel +qu +daggers +estranged +fabricated +towels +vinnie +wcw +misunderstanding +anglia +nothin +unmistakable +##dust +##lova +chilly +marquette +truss +##edge +##erine +reece +##lty +##chemist +##connected +272 +308 +41st +bash +raion +waterfalls +##ump +##main +labyrinth +queue +theorist +##istle +bharatiya +flexed +soundtracks +rooney +leftist +patrolling +wharton +plainly +alleviate +eastman +schuster +topographic +engages +immensely +unbearable +fairchild +1620 +dona +lurking +parisian +oliveira +ia +indictment +hahn +bangladeshi +##aster +vivo +##uming +##ential +antonia +expects +indoors +kildare +harlan +##logue +##ogenic +##sities +forgiven +##wat +childish +tavi +##mide +##orra +plausible +grimm +successively +scooted +##bola +##dget +##rith +spartans +emery +flatly +azure +epilogue +##wark +flourish +##iny +##tracted +##overs +##oshi +bestseller +distressed +receipt 
+spitting +hermit +topological +##cot +drilled +subunit +francs +##layer +eel +##fk +##itas +octopus +footprint +petitions +ufo +##say +##foil +interfering +leaking +palo +##metry +thistle +valiant +##pic +narayan +mcpherson +##fast +gonzales +##ym +##enne +dustin +novgorod +solos +##zman +doin +##raph +##patient +##meyer +soluble +ashland +cuffs +carole +pendleton +whistling +vassal +##river +deviation +revisited +constituents +rallied +rotate +loomed +##eil +##nting +amateurs +augsburg +auschwitz +crowns +skeletons +##cona +bonnet +257 +dummy +globalization +simeon +sleeper +mandal +differentiated +##crow +##mare +milne +bundled +exasperated +talmud +owes +segregated +##feng +##uary +dentist +piracy +props +##rang +devlin +##torium +malicious +paws +##laid +dependency +##ergy +##fers +##enna +258 +pistons +rourke +jed +grammatical +tres +maha +wig +512 +ghostly +jayne +##achal +##creen +##ilis +##lins +##rence +designate +##with +arrogance +cambodian +clones +showdown +throttle +twain +##ception +lobes +metz +nagoya +335 +braking +##furt +385 +roaming +##minster +amin +crippled +##37 +##llary +indifferent +hoffmann +idols +intimidating +1751 +261 +influenza +memo +onions +1748 +bandage +consciously +##landa +##rage +clandestine +observes +swiped +tangle +##ener +##jected +##trum +##bill +##lta +hugs +congresses +josiah +spirited +##dek +humanist +managerial +filmmaking +inmate +rhymes +debuting +grimsby +ur +##laze +duplicate +vigor +##tf +republished +bolshevik +refurbishment +antibiotics +martini +methane +newscasts +royale +horizons +levant +iain +visas +##ischen +paler +##around +manifestation +snuck +alf +chop +futile +pedestal +rehab +##kat +bmg +kerman +res +fairbanks +jarrett +abstraction +saharan +##zek +1746 +procedural +clearer +kincaid +sash +luciano +##ffey +crunch +helmut +##vara +revolutionaries +##tute +creamy +leach +##mmon +1747 +permitting +nes +plight +wendell +##lese +contra +ts +clancy +ipa +mach +staples +autopsy +disturbances +nueva +karin +pontiac +##uding +proxy +venerable +haunt +leto +bergman +expands +##helm +wal +##pipe +canning +celine +cords +obesity +##enary +intrusion +planner +##phate +reasoned +sequencing +307 +harrow +##chon +##dora +marred +mcintyre +repay +tarzan +darting +248 +harrisburg +margarita +repulsed +##hur +##lding +belinda +hamburger +novo +compliant +runways +bingham +registrar +skyscraper +ic +cuthbert +improvisation +livelihood +##corp +##elial +admiring +##dened +sporadic +believer +casablanca +popcorn +##29 +asha +shovel +##bek +##dice +coiled +tangible +##dez +casper +elsie +resin +tenderness +rectory +##ivision +avail +sonar +##mori +boutique +##dier +guerre +bathed +upbringing +vaulted +sandals +blessings +##naut +##utnant +1680 +306 +foxes +pia +corrosion +hesitantly +confederates +crystalline +footprints +shapiro +tirana +valentin +drones +45th +microscope +shipments +texted +inquisition +wry +guernsey +unauthorized +resigning +760 +ripple +schubert +stu +reassure +felony +##ardo +brittle +koreans +##havan +##ives +dun +implicit +tyres +##aldi +##lth +magnolia +##ehan +##puri +##poulos +aggressively +fei +gr +familiarity +##poo +indicative +##trust +fundamentally +jimmie +overrun +395 +anchors +moans +##opus +britannia +armagh +##ggle +purposely +seizing +##vao +bewildered +mundane +avoidance +cosmopolitan +geometridae +quartermaster +caf +415 +chatter +engulfed +gleam +purge +##icate +juliette +jurisprudence +guerra +revisions +##bn +casimir +brew +##jm +1749 +clapton +cloudy +conde +hermitage +278 +simulations +torches 
+vincenzo +matteo +##rill +hidalgo +booming +westbound +accomplishment +tentacles +unaffected +##sius +annabelle +flopped +sloping +##litz +dreamer +interceptor +vu +##loh +consecration +copying +messaging +breaker +climates +hospitalized +1752 +torino +afternoons +winfield +witnessing +##teacher +breakers +choirs +sawmill +coldly +##ege +sipping +haste +uninhabited +conical +bibliography +pamphlets +severn +edict +##oca +deux +illnesses +grips +##pl +rehearsals +sis +thinkers +tame +##keepers +1690 +acacia +reformer +##osed +##rys +shuffling +##iring +##shima +eastbound +ionic +rhea +flees +littered +##oum +rocker +vomiting +groaning +champ +overwhelmingly +civilizations +paces +sloop +adoptive +##tish +skaters +##vres +aiding +mango +##joy +nikola +shriek +##ignon +pharmaceuticals +##mg +tuna +calvert +gustavo +stocked +yearbook +##urai +##mana +computed +subsp +riff +hanoi +kelvin +hamid +moors +pastures +summons +jihad +nectar +##ctors +bayou +untitled +pleasing +vastly +republics +intellect +##η +##ulio +##tou +crumbling +stylistic +sb +##ی +consolation +frequented +h₂o +walden +widows +##iens +404 +##ignment +chunks +improves +288 +grit +recited +##dev +snarl +sociological +##arte +##gul +inquired +##held +bruise +clube +consultancy +homogeneous +hornets +multiplication +pasta +prick +savior +##grin +##kou +##phile +yoon +##gara +grimes +vanishing +cheering +reacting +bn +distillery +##quisite +##vity +coe +dockyard +massif +##jord +escorts +voss +##valent +byte +chopped +hawke +illusions +workings +floats +##koto +##vac +kv +annapolis +madden +##onus +alvaro +noctuidae +##cum +##scopic +avenge +steamboat +forte +illustrates +erika +##trip +570 +dew +nationalities +bran +manifested +thirsty +diversified +muscled +reborn +##standing +arson +##lessness +##dran +##logram +##boys +##kushima +##vious +willoughby +##phobia +286 +alsace +dashboard +yuki +##chai +granville +myspace +publicized +tricked +##gang +adjective +##ater +relic +reorganisation +enthusiastically +indications +saxe +##lassified +consolidate +iec +padua +helplessly +ramps +renaming +regulars +pedestrians +accents +convicts +inaccurate +lowers +mana +##pati +barrie +bjp +outta +someplace +berwick +flanking +invoked +marrow +sparsely +excerpts +clothed +rei +##ginal +wept +##straße +##vish +alexa +excel +##ptive +membranes +aquitaine +creeks +cutler +sheppard +implementations +ns +##dur +fragrance +budge +concordia +magnesium +marcelo +##antes +gladly +vibrating +##rral +##ggles +montrose +##omba +lew +seamus +1630 +cocky +##ament +##uen +bjorn +##rrick +fielder +fluttering +##lase +methyl +kimberley +mcdowell +reductions +barbed +##jic +##tonic +aeronautical +condensed +distracting +##promising +huffed +##cala +##sle +claudius +invincible +missy +pious +balthazar +ci +##lang +butte +combo +orson +##dication +myriad +1707 +silenced +##fed +##rh +coco +netball +yourselves +##oza +clarify +heller +peg +durban +etudes +offender +roast +blackmail +curvature +##woods +vile +309 +illicit +suriname +##linson +overture +1685 +bubbling +gymnast +tucking +##mming +##ouin +maldives +##bala +gurney +##dda +##eased +##oides +backside +pinto +jars +racehorse +tending +##rdial +baronetcy +wiener +duly +##rke +barbarian +cupping +flawed +##thesis +bertha +pleistocene +puddle +swearing +##nob +##tically +fleeting +prostate +amulet +educating +##mined +##iti +##tler +75th +jens +respondents +analytics +cavaliers +papacy +raju +##iente +##ulum +##tip +funnel +271 +disneyland +##lley +sociologist +##iam +2500 +faulkner +louvre +menon +##dson 
+276 +##ower +afterlife +mannheim +peptide +referees +comedians +meaningless +##anger +##laise +fabrics +hurley +renal +sleeps +##bour +##icle +breakout +kristin +roadside +animator +clover +disdain +unsafe +redesign +##urity +firth +barnsley +portage +reset +narrows +268 +commandos +expansive +speechless +tubular +##lux +essendon +eyelashes +smashwords +##yad +##bang +##claim +craved +sprinted +chet +somme +astor +wrocław +orton +266 +bane +##erving +##uing +mischief +##amps +##sund +scaling +terre +##xious +impairment +offenses +undermine +moi +soy +contiguous +arcadia +inuit +seam +##tops +macbeth +rebelled +##icative +##iot +590 +elaborated +frs +uniformed +##dberg +259 +powerless +priscilla +stimulated +980 +qc +arboretum +frustrating +trieste +bullock +##nified +enriched +glistening +intern +##adia +locus +nouvelle +ollie +ike +lash +starboard +ee +tapestry +headlined +hove +rigged +##vite +pollock +##yme +thrive +clustered +cas +roi +gleamed +olympiad +##lino +pressured +regimes +##hosis +##lick +ripley +##ophone +kickoff +gallon +rockwell +##arable +crusader +glue +revolutions +scrambling +1714 +grover +##jure +englishman +aztec +263 +contemplating +coven +ipad +preach +triumphant +tufts +##esian +rotational +##phus +328 +falkland +##brates +strewn +clarissa +rejoin +environmentally +glint +banded +drenched +moat +albanians +johor +rr +maestro +malley +nouveau +shaded +taxonomy +v6 +adhere +bunk +airfields +##ritan +1741 +encompass +remington +tran +##erative +amelie +mazda +friar +morals +passions +##zai +breadth +vis +##hae +argus +burnham +caressing +insider +rudd +##imov +##mini +##rso +italianate +murderous +textual +wainwright +armada +bam +weave +timer +##taken +##nh +fra +##crest +ardent +salazar +taps +tunis +##ntino +allegro +gland +philanthropic +##chester +implication +##optera +esq +judas +noticeably +wynn +##dara +inched +indexed +crises +villiers +bandit +royalties +patterned +cupboard +interspersed +accessory +isla +kendrick +entourage +stitches +##esthesia +headwaters +##ior +interlude +distraught +draught +1727 +##basket +biased +sy +transient +triad +subgenus +adapting +kidd +shortstop +##umatic +dimly +spiked +mcleod +reprint +nellie +pretoria +windmill +##cek +singled +##mps +273 +reunite +##orous +747 +bankers +outlying +##omp +##ports +##tream +apologies +cosmetics +patsy +##deh +##ocks +##yson +bender +nantes +serene +##nad +lucha +mmm +323 +##cius +##gli +cmll +coinage +nestor +juarez +##rook +smeared +sprayed +twitching +sterile +irina +embodied +juveniles +enveloped +miscellaneous +cancers +dq +gulped +luisa +crested +swat +donegal +ref +##anov +##acker +hearst +mercantile +##lika +doorbell +ua +vicki +##alla +##som +bilbao +psychologists +stryker +sw +horsemen +turkmenistan +wits +##national +anson +mathew +screenings +##umb +rihanna +##agne +##nessy +aisles +##iani +##osphere +hines +kenton +saskatoon +tasha +truncated +##champ +##itan +mildred +advises +fredrik +interpreting +inhibitors +##athi +spectroscopy +##hab +##kong +karim +panda +##oia +##nail +##vc +conqueror +kgb +leukemia +##dity +arrivals +cheered +pisa +phosphorus +shielded +##riated +mammal +unitarian +urgently +chopin +sanitary +##mission +spicy +drugged +hinges +##tort +tipping +trier +impoverished +westchester +##caster +267 +epoch +nonstop +##gman +##khov +aromatic +centrally +cerro +##tively +##vio +billions +modulation +sedimentary +283 +facilitating +outrageous +goldstein +##eak +##kt +ld +maitland +penultimate +pollard +##dance +fleets +spaceship +vertebrae +##nig +alcoholism +als 
+recital +##bham +##ference +##omics +m2 +##bm +trois +##tropical +##в +commemorates +##meric +marge +##raction +1643 +670 +cosmetic +ravaged +##ige +catastrophe +eng +##shida +albrecht +arterial +bellamy +decor +harmon +##rde +bulbs +synchronized +vito +easiest +shetland +shielding +wnba +##glers +##ssar +##riam +brianna +cumbria +##aceous +##rard +cores +thayer +##nsk +brood +hilltop +luminous +carts +keynote +larkin +logos +##cta +##ا +##mund +##quay +lilith +tinted +277 +wrestle +mobilization +##uses +sequential +siam +bloomfield +takahashi +274 +##ieving +presenters +ringo +blazed +witty +##oven +##ignant +devastation +haydn +harmed +newt +therese +##peed +gershwin +molina +rabbis +sudanese +001 +innate +restarted +##sack +##fus +slices +wb +##shah +enroll +hypothetical +hysterical +1743 +fabio +indefinite +warped +##hg +exchanging +525 +unsuitable +##sboro +gallo +1603 +bret +cobalt +homemade +##hunter +mx +operatives +##dhar +terraces +durable +latch +pens +whorls +##ctuated +##eaux +billing +ligament +succumbed +##gly +regulators +spawn +##brick +##stead +filmfare +rochelle +##nzo +1725 +circumstance +saber +supplements +##nsky +##tson +crowe +wellesley +carrot +##9th +##movable +primate +drury +sincerely +topical +##mad +##rao +callahan +kyiv +smarter +tits +undo +##yeh +announcements +anthologies +barrio +nebula +##islaus +##shaft +##tyn +bodyguards +2021 +assassinate +barns +emmett +scully +##mah +##yd +##eland +##tino +##itarian +demoted +gorman +lashed +prized +adventist +writ +##gui +alla +invertebrates +##ausen +1641 +amman +1742 +align +healy +redistribution +##gf +##rize +insulation +##drop +adherents +hezbollah +vitro +ferns +yanking +269 +php +registering +uppsala +cheerleading +confines +mischievous +tully +##ross +49th +docked +roam +stipulated +pumpkin +##bry +prompt +##ezer +blindly +shuddering +craftsmen +frail +scented +katharine +scramble +shaggy +sponge +helix +zaragoza +279 +##52 +43rd +backlash +fontaine +seizures +posse +cowan +nonfiction +telenovela +wwii +hammered +undone +##gpur +encircled +irs +##ivation +artefacts +oneself +searing +smallpox +##belle +##osaurus +shandong +breached +upland +blushing +rankin +infinitely +psyche +tolerated +docking +evicted +##col +unmarked +##lving +gnome +lettering +litres +musique +##oint +benevolent +##jal +blackened +##anna +mccall +racers +tingle +##ocene +##orestation +introductions +radically +292 +##hiff +##باد +1610 +1739 +munchen +plead +##nka +condo +scissors +##sight +##tens +apprehension +##cey +##yin +hallmark +watering +formulas +sequels +##llas +aggravated +bae +commencing +##building +enfield +prohibits +marne +vedic +civilized +euclidean +jagger +beforehand +blasts +dumont +##arney +##nem +740 +conversions +hierarchical +rios +simulator +##dya +##lellan +hedges +oleg +thrusts +shadowed +darby +maximize +1744 +gregorian +##nded +##routed +sham +unspecified +##hog +emory +factual +##smo +##tp +fooled +##rger +ortega +wellness +marlon +##oton +##urance +casket +keating +ley +enclave +##ayan +char +influencing +jia +##chenko +412 +ammonia +erebidae +incompatible +violins +cornered +##arat +grooves +astronauts +columbian +rampant +fabrication +kyushu +mahmud +vanish +##dern +mesopotamia +##lete +ict +##rgen +caspian +kenji +pitted +##vered +999 +grimace +roanoke +tchaikovsky +twinned +##analysis +##awan +xinjiang +arias +clemson +kazakh +sizable +1662 +##khand +##vard +plunge +tatum +vittorio +##nden +cholera +##dana +##oper +bracing +indifference +projectile +superliga +##chee +realises +upgrading +299 +porte 
+retribution +##vies +nk +stil +##resses +ama +bureaucracy +blackberry +bosch +testosterone +collapses +greer +##pathic +ioc +fifties +malls +##erved +bao +baskets +adolescents +siegfried +##osity +##tosis +mantra +detecting +existent +fledgling +##cchi +dissatisfied +gan +telecommunication +mingled +sobbed +6000 +controversies +outdated +taxis +##raus +fright +slams +##lham +##fect +##tten +detectors +fetal +tanned +##uw +fray +goth +olympian +skipping +mandates +scratches +sheng +unspoken +hyundai +tracey +hotspur +restrictive +##buch +americana +mundo +##bari +burroughs +diva +vulcan +##6th +distinctions +thumping +##ngen +mikey +sheds +fide +rescues +springsteen +vested +valuation +##ece +##ely +pinnacle +rake +sylvie +##edo +almond +quivering +##irus +alteration +faltered +##wad +51st +hydra +ticked +##kato +recommends +##dicated +antigua +arjun +stagecoach +wilfred +trickle +pronouns +##pon +aryan +nighttime +##anian +gall +pea +stitch +##hei +leung +milos +##dini +eritrea +nexus +starved +snowfall +kant +parasitic +cot +discus +hana +strikers +appleton +kitchens +##erina +##partisan +##itha +##vius +disclose +metis +##channel +1701 +tesla +##vera +fitch +1735 +blooded +##tila +decimal +##tang +##bai +cyclones +eun +bottled +peas +pensacola +basha +bolivian +crabs +boil +lanterns +partridge +roofed +1645 +necks +##phila +opined +patting +##kla +##lland +chuckles +volta +whereupon +##nche +devout +euroleague +suicidal +##dee +inherently +involuntary +knitting +nasser +##hide +puppets +colourful +courageous +southend +stills +miraculous +hodgson +richer +rochdale +ethernet +greta +uniting +prism +umm +##haya +##itical +##utation +deterioration +pointe +prowess +##ropriation +lids +scranton +billings +subcontinent +##koff +##scope +brute +kellogg +psalms +degraded +##vez +stanisław +##ructured +ferreira +pun +astonishing +gunnar +##yat +arya +prc +gottfried +##tight +excursion +##ographer +dina +##quil +##nare +huffington +illustrious +wilbur +gundam +verandah +##zard +naacp +##odle +constructive +fjord +kade +##naud +generosity +thrilling +baseline +cayman +frankish +plastics +accommodations +zoological +##fting +cedric +qb +motorized +##dome +##otted +squealed +tackled +canucks +budgets +situ +asthma +dail +gabled +grasslands +whimpered +writhing +judgments +##65 +minnie +pv +##carbon +bananas +grille +domes +monique +odin +maguire +markham +tierney +##estra +##chua +libel +poke +speedy +atrium +laval +notwithstanding +##edly +fai +kala +##sur +robb +##sma +listings +luz +supplementary +tianjin +##acing +enzo +jd +ric +scanner +croats +transcribed +##49 +arden +cv +##hair +##raphy +##lver +##uy +357 +seventies +staggering +alam +horticultural +hs +regression +timbers +blasting +##ounded +montagu +manipulating +##cit +catalytic +1550 +troopers +##meo +condemnation +fitzpatrick +##oire +##roved +inexperienced +1670 +castes +##lative +outing +314 +dubois +flicking +quarrel +ste +learners +1625 +iq +whistled +##class +282 +classify +tariffs +temperament +355 +folly +liszt +##yles +immersed +jordanian +ceasefire +apparel +extras +maru +fished +##bio +harta +stockport +assortment +craftsman +paralysis +transmitters +##cola +blindness +##wk +fatally +proficiency +solemnly +##orno +repairing +amore +groceries +ultraviolet +##chase +schoolhouse +##tua +resurgence +nailed +##otype +##× +ruse +saliva +diagrams +##tructing +albans +rann +thirties +1b +antennas +hilarious +cougars +paddington +stats +##eger +breakaway +ipod +reza +authorship +prohibiting +scoffed +##etz +##ttle +conscription 
+defected +trondheim +##fires +ivanov +keenan +##adan +##ciful +##fb +##slow +locating +##ials +##tford +cadiz +basalt +blankly +interned +rags +rattling +##tick +carpathian +reassured +sync +bum +guildford +iss +staunch +##onga +astronomers +sera +sofie +emergencies +susquehanna +##heard +duc +mastery +vh1 +williamsburg +bayer +buckled +craving +##khan +##rdes +bloomington +##write +alton +barbecue +##bians +justine +##hri +##ndt +delightful +smartphone +newtown +photon +retrieval +peugeot +hissing +##monium +##orough +flavors +lighted +relaunched +tainted +##games +##lysis +anarchy +microscopic +hopping +adept +evade +evie +##beau +inhibit +sinn +adjustable +hurst +intuition +wilton +cisco +44th +lawful +lowlands +stockings +thierry +##dalen +##hila +##nai +fates +prank +tb +maison +lobbied +provocative +1724 +4a +utopia +##qual +carbonate +gujarati +purcell +##rford +curtiss +##mei +overgrown +arenas +mediation +swallows +##rnik +respectful +turnbull +##hedron +##hope +alyssa +ozone +##ʻi +ami +gestapo +johansson +snooker +canteen +cuff +declines +empathy +stigma +##ags +##iner +##raine +taxpayers +gui +volga +##wright +##copic +lifespan +overcame +tattooed +enactment +giggles +##ador +##camp +barrington +bribe +obligatory +orbiting +peng +##enas +elusive +sucker +##vating +cong +hardship +empowered +anticipating +estrada +cryptic +greasy +detainees +planck +sudbury +plaid +dod +marriott +kayla +##ears +##vb +##zd +mortally +##hein +cognition +radha +319 +liechtenstein +meade +richly +argyle +harpsichord +liberalism +trumpets +lauded +tyrant +salsa +tiled +lear +promoters +reused +slicing +trident +##chuk +##gami +##lka +cantor +checkpoint +##points +gaul +leger +mammalian +##tov +##aar +##schaft +doha +frenchman +nirvana +##vino +delgado +headlining +##eron +##iography +jug +tko +1649 +naga +intersections +##jia +benfica +nawab +##suka +ashford +gulp +##deck +##vill +##rug +brentford +frazier +pleasures +dunne +potsdam +shenzhen +dentistry +##tec +flanagan +##dorff +##hear +chorale +dinah +prem +quezon +##rogated +relinquished +sutra +terri +##pani +flaps +##rissa +poly +##rnet +homme +aback +##eki +linger +womb +##kson +##lewood +doorstep +orthodoxy +threaded +westfield +##rval +dioceses +fridays +subsided +##gata +loyalists +##biotic +##ettes +letterman +lunatic +prelate +tenderly +invariably +souza +thug +winslow +##otide +furlongs +gogh +jeopardy +##runa +pegasus +##umble +humiliated +standalone +tagged +##roller +freshmen +klan +##bright +attaining +initiating +transatlantic +logged +viz +##uance +1723 +combatants +intervening +stephane +chieftain +despised +grazed +317 +cdc +galveston +godzilla +macro +simulate +##planes +parades +##esses +960 +##ductive +##unes +equator +overdose +##cans +##hosh +##lifting +joshi +epstein +sonora +treacherous +aquatics +manchu +responsive +##sation +supervisory +##christ +##llins +##ibar +##balance +##uso +kimball +karlsruhe +mab +##emy +ignores +phonetic +reuters +spaghetti +820 +almighty +danzig +rumbling +tombstone +designations +lured +outset +##felt +supermarkets +##wt +grupo +kei +kraft +susanna +##blood +comprehension +genealogy +##aghan +##verted +redding +##ythe +1722 +bowing +##pore +##roi +lest +sharpened +fulbright +valkyrie +sikhs +##unds +swans +bouquet +merritt +##tage +##venting +commuted +redhead +clerks +leasing +cesare +dea +hazy +##vances +fledged +greenfield +servicemen +##gical +armando +blackout +dt +sagged +downloadable +intra +potion +pods +##4th +##mism +xp +attendants +gambia +stale +##ntine +plump +asteroids 
+rediscovered +buds +flea +hive +##neas +1737 +classifications +debuts +##eles +olympus +scala +##eurs +##gno +##mute +hummed +sigismund +visuals +wiggled +await +pilasters +clench +sulfate +##ances +bellevue +enigma +trainee +snort +##sw +clouded +denim +##rank +##rder +churning +hartman +lodges +riches +sima +##missible +accountable +socrates +regulates +mueller +##cr +1702 +avoids +solids +himalayas +nutrient +pup +##jevic +squat +fades +nec +##lates +##pina +##rona +##ου +privateer +tequila +##gative +##mpton +apt +hornet +immortals +##dou +asturias +cleansing +dario +##rries +##anta +etymology +servicing +zhejiang +##venor +##nx +horned +erasmus +rayon +relocating +£10 +##bags +escalated +promenade +stubble +2010s +artisans +axial +liquids +mora +sho +yoo +##tsky +bundles +oldies +##nally +notification +bastion +##ths +sparkle +##lved +1728 +leash +pathogen +highs +##hmi +immature +880 +gonzaga +ignatius +mansions +monterrey +sweets +bryson +##loe +polled +regatta +brightest +pei +rosy +squid +hatfield +payroll +addict +meath +cornerback +heaviest +lodging +##mage +capcom +rippled +##sily +barnet +mayhem +ymca +snuggled +rousseau +##cute +blanchard +284 +fragmented +leighton +chromosomes +risking +##md +##strel +##utter +corinne +coyotes +cynical +hiroshi +yeomanry +##ractive +ebook +grading +mandela +plume +agustin +magdalene +##rkin +bea +femme +trafford +##coll +##lun +##tance +52nd +fourier +upton +##mental +camilla +gust +iihf +islamabad +longevity +##kala +feldman +netting +##rization +endeavour +foraging +mfa +orr +##open +greyish +contradiction +graz +##ruff +handicapped +marlene +tweed +oaxaca +spp +campos +miocene +pri +configured +cooks +pluto +cozy +pornographic +##entes +70th +fairness +glided +jonny +lynne +rounding +sired +##emon +##nist +remade +uncover +##mack +complied +lei +newsweek +##jured +##parts +##enting +##pg +293 +finer +guerrillas +athenian +deng +disused +stepmother +accuse +gingerly +seduction +521 +confronting +##walker +##going +gora +nostalgia +sabres +virginity +wrenched +##minated +syndication +wielding +eyre +##56 +##gnon +##igny +behaved +taxpayer +sweeps +##growth +childless +gallant +##ywood +amplified +geraldine +scrape +##ffi +babylonian +fresco +##rdan +##kney +##position +1718 +restricting +tack +fukuoka +osborn +selector +partnering +##dlow +318 +gnu +kia +tak +whitley +gables +##54 +##mania +mri +softness +immersion +##bots +##evsky +1713 +chilling +insignificant +pcs +##uis +elites +lina +purported +supplemental +teaming +##americana +##dding +##inton +proficient +rouen +##nage +##rret +niccolo +selects +##bread +fluffy +1621 +gruff +knotted +mukherjee +polgara +thrash +nicholls +secluded +smoothing +thru +corsica +loaf +whitaker +inquiries +##rrier +##kam +indochina +289 +marlins +myles +peking +##tea +extracts +pastry +superhuman +connacht +vogel +##ditional +##het +##udged +##lash +gloss +quarries +refit +teaser +##alic +##gaon +20s +materialized +sling +camped +pickering +tung +tracker +pursuant +##cide +cranes +soc +##cini +##typical +##viere +anhalt +overboard +workout +chores +fares +orphaned +stains +##logie +fenton +surpassing +joyah +triggers +##itte +grandmaster +##lass +##lists +clapping +fraudulent +ledger +nagasaki +##cor +##nosis +##tsa +eucalyptus +tun +##icio +##rney +##tara +dax +heroism +ina +wrexham +onboard +unsigned +##dates +moshe +galley +winnie +droplets +exiles +praises +watered +noodles +##aia +fein +adi +leland +multicultural +stink +bingo +comets +erskine +modernized +canned +constraint +domestically 
+chemotherapy +featherweight +stifled +##mum +darkly +irresistible +refreshing +hasty +isolate +##oys +kitchener +planners +##wehr +cages +yarn +implant +toulon +elects +childbirth +yue +##lind +##lone +cn +rightful +sportsman +junctions +remodeled +specifies +##rgh +291 +##oons +complimented +##urgent +lister +ot +##logic +bequeathed +cheekbones +fontana +gabby +##dial +amadeus +corrugated +maverick +resented +triangles +##hered +##usly +nazareth +tyrol +1675 +assent +poorer +sectional +aegean +##cous +296 +nylon +ghanaian +##egorical +##weig +cushions +forbid +fusiliers +obstruction +somerville +##scia +dime +earrings +elliptical +leyte +oder +polymers +timmy +atm +midtown +piloted +settles +continual +externally +mayfield +##uh +enrichment +henson +keane +persians +1733 +benji +braden +pep +324 +##efe +contenders +pepsi +valet +##isches +298 +##asse +##earing +goofy +stroll +##amen +authoritarian +occurrences +adversary +ahmedabad +tangent +toppled +dorchester +1672 +modernism +marxism +islamist +charlemagne +exponential +racks +unicode +brunette +mbc +pic +skirmish +##bund +##lad +##powered +##yst +hoisted +messina +shatter +##ctum +jedi +vantage +##music +##neil +clemens +mahmoud +corrupted +authentication +lowry +nils +##washed +omnibus +wounding +jillian +##itors +##opped +serialized +narcotics +handheld +##arm +##plicity +intersecting +stimulating +##onis +crate +fellowships +hemingway +casinos +climatic +fordham +copeland +drip +beatty +leaflets +robber +brothel +madeira +##hedral +sphinx +ultrasound +##vana +valor +forbade +leonid +villas +##aldo +duane +marquez +##cytes +disadvantaged +forearms +kawasaki +reacts +consular +lax +uncles +uphold +##hopper +concepcion +dorsey +lass +##izan +arching +passageway +1708 +researches +tia +internationals +##graphs +##opers +distinguishes +javanese +divert +##uven +plotted +##listic +##rwin +##erik +##tify +affirmative +signifies +validation +##bson +kari +felicity +georgina +zulu +##eros +##rained +##rath +overcoming +##dot +argyll +##rbin +1734 +chiba +ratification +windy +earls +parapet +##marks +hunan +pristine +astrid +punta +##gart +brodie +##kota +##oder +malaga +minerva +rouse +##phonic +bellowed +pagoda +portals +reclamation +##gur +##odies +##⁄₄ +parentheses +quoting +allergic +palette +showcases +benefactor +heartland +nonlinear +##tness +bladed +cheerfully +scans +##ety +##hone +1666 +girlfriends +pedersen +hiram +sous +##liche +##nator +1683 +##nery +##orio +##umen +bobo +primaries +smiley +##cb +unearthed +uniformly +fis +metadata +1635 +ind +##oted +recoil +##titles +##tura +##ια +406 +hilbert +jamestown +mcmillan +tulane +seychelles +##frid +antics +coli +fated +stucco +##grants +1654 +bulky +accolades +arrays +caledonian +carnage +optimism +puebla +##tative +##cave +enforcing +rotherham +seo +dunlop +aeronautics +chimed +incline +zoning +archduke +hellenistic +##oses +##sions +candi +thong +##ople +magnate +rustic +##rsk +projective +slant +##offs +danes +hollis +vocalists +##ammed +congenital +contend +gesellschaft +##ocating +##pressive +douglass +quieter +##cm +##kshi +howled +salim +spontaneously +townsville +buena +southport +##bold +kato +1638 +faerie +stiffly +##vus +##rled +297 +flawless +realising +taboo +##7th +bytes +straightening +356 +jena +##hid +##rmin +cartwright +berber +bertram +soloists +411 +noses +417 +coping +fission +hardin +inca +##cen +1717 +mobilized +vhf +##raf +biscuits +curate +##85 +##anial +331 +gaunt +neighbourhoods +1540 +##abas +blanca +bypassed +sockets +behold +coincidentally +##bane 
+nara +shave +splinter +terrific +##arion +##erian +commonplace +juris +redwood +waistband +boxed +caitlin +fingerprints +jennie +naturalized +##ired +balfour +craters +jody +bungalow +hugely +quilt +glitter +pigeons +undertaker +bulging +constrained +goo +##sil +##akh +assimilation +reworked +##person +persuasion +##pants +felicia +##cliff +##ulent +1732 +explodes +##dun +##inium +##zic +lyman +vulture +hog +overlook +begs +northwards +ow +spoil +##urer +fatima +favorably +accumulate +sargent +sorority +corresponded +dispersal +kochi +toned +##imi +##lita +internacional +newfound +##agger +##lynn +##rigue +booths +peanuts +##eborg +medicare +muriel +nur +##uram +crates +millennia +pajamas +worsened +##breakers +jimi +vanuatu +yawned +##udeau +carousel +##hony +hurdle +##ccus +##mounted +##pod +rv +##eche +airship +ambiguity +compulsion +recapture +##claiming +arthritis +##osomal +1667 +asserting +ngc +sniffing +dade +discontent +glendale +ported +##amina +defamation +rammed +##scent +fling +livingstone +##fleet +875 +##ppy +apocalyptic +comrade +lcd +##lowe +cessna +eine +persecuted +subsistence +demi +hoop +reliefs +710 +coptic +progressing +stemmed +perpetrators +1665 +priestess +##nio +dobson +ebony +rooster +itf +tortricidae +##bbon +##jian +cleanup +##jean +##øy +1721 +eighties +taxonomic +holiness +##hearted +##spar +antilles +showcasing +stabilized +##nb +gia +mascara +michelangelo +dawned +##uria +##vinsky +extinguished +fitz +grotesque +£100 +##fera +##loid +##mous +barges +neue +throbbed +cipher +johnnie +##a1 +##mpt +outburst +##swick +spearheaded +administrations +c1 +heartbreak +pixels +pleasantly +##enay +lombardy +plush +##nsed +bobbie +##hly +reapers +tremor +xiang +minogue +substantive +hitch +barak +##wyl +kwan +##encia +910 +obscene +elegance +indus +surfer +bribery +conserve +##hyllum +##masters +horatio +##fat +apes +rebound +psychotic +##pour +iteration +##mium +##vani +botanic +horribly +antiques +dispose +paxton +##hli +##wg +timeless +1704 +disregard +engraver +hounds +##bau +##version +looted +uno +facilitates +groans +masjid +rutland +antibody +disqualification +decatur +footballers +quake +slacks +48th +rein +scribe +stabilize +commits +exemplary +tho +##hort +##chison +pantry +traversed +##hiti +disrepair +identifiable +vibrated +baccalaureate +##nnis +csa +interviewing +##iensis +##raße +greaves +wealthiest +343 +classed +jogged +£5 +##58 +##atal +illuminating +knicks +respecting +##uno +scrubbed +##iji +##dles +kruger +moods +growls +raider +silvia +chefs +kam +vr +cree +percival +##terol +gunter +counterattack +defiant +henan +ze +##rasia +##riety +equivalence +submissions +##fra +##thor +bautista +mechanically +##heater +cornice +herbal +templar +##mering +outputs +ruining +ligand +renumbered +extravagant +mika +blockbuster +eta +insurrection +##ilia +darkening +ferocious +pianos +strife +kinship +##aer +melee +##anor +##iste +##may +##oue +decidedly +weep +##jad +##missive +##ppel +354 +puget +unease +##gnant +1629 +hammering +kassel +ob +wessex +##lga +bromwich +egan +paranoia +utilization +##atable +##idad +contradictory +provoke +##ols +##ouring +##tangled +knesset +##very +##lette +plumbing +##sden +##¹ +greensboro +occult +sniff +338 +zev +beaming +gamer +haggard +mahal +##olt +##pins +mendes +utmost +briefing +gunnery +##gut +##pher +##zh +##rok +1679 +khalifa +sonya +##boot +principals +urbana +wiring +##liffe +##minating +##rrado +dahl +nyu +skepticism +np +townspeople +ithaca +lobster +somethin +##fur +##arina +##−1 +freighter +zimmerman +biceps 
+contractual +##herton +amend +hurrying +subconscious +##anal +336 +meng +clermont +spawning +##eia +##lub +dignitaries +impetus +snacks +spotting +twigs +##bilis +##cz +##ouk +libertadores +nic +skylar +##aina +##firm +gustave +asean +##anum +dieter +legislatures +flirt +bromley +trolls +umar +##bbies +##tyle +blah +parc +bridgeport +crank +negligence +##nction +46th +constantin +molded +bandages +seriousness +00pm +siegel +carpets +compartments +upbeat +statehood +##dner +##edging +marko +730 +platt +##hane +paving +##iy +1738 +abbess +impatience +limousine +nbl +##talk +441 +lucille +mojo +nightfall +robbers +##nais +karel +brisk +calves +replicate +ascribed +telescopes +##olf +intimidated +##reen +ballast +specialization +##sit +aerodynamic +caliphate +rainer +visionary +##arded +epsilon +##aday +##onte +aggregation +auditory +boosted +reunification +kathmandu +loco +robyn +402 +acknowledges +appointing +humanoid +newell +redeveloped +restraints +##tained +barbarians +chopper +1609 +italiana +##lez +##lho +investigates +wrestlemania +##anies +##bib +690 +##falls +creaked +dragoons +gravely +minions +stupidity +volley +##harat +##week +musik +##eries +##uously +fungal +massimo +semantics +malvern +##ahl +##pee +discourage +embryo +imperialism +1910s +profoundly +##ddled +jiangsu +sparkled +stat +##holz +sweatshirt +tobin +##iction +sneered +##cheon +##oit +brit +causal +smyth +##neuve +diffuse +perrin +silvio +##ipes +##recht +detonated +iqbal +selma +##nism +##zumi +roasted +##riders +tay +##ados +##mament +##mut +##rud +840 +completes +nipples +cfa +flavour +hirsch +##laus +calderon +sneakers +moravian +##ksha +1622 +rq +294 +##imeters +bodo +##isance +##pre +##ronia +anatomical +excerpt +##lke +dh +kunst +##tablished +##scoe +biomass +panted +unharmed +gael +housemates +montpellier +##59 +coa +rodents +tonic +hickory +singleton +##taro +451 +1719 +aldo +breaststroke +dempsey +och +rocco +##cuit +merton +dissemination +midsummer +serials +##idi +haji +polynomials +##rdon +gs +enoch +prematurely +shutter +taunton +£3 +##grating +##inates +archangel +harassed +##asco +326 +archway +dazzling +##ecin +1736 +sumo +wat +##kovich +1086 +honneur +##ently +##nostic +##ttal +##idon +1605 +403 +1716 +blogger +rents +##gnan +hires +##ikh +##dant +howie +##rons +handler +retracted +shocks +1632 +arun +duluth +kepler +trumpeter +##lary +peeking +seasoned +trooper +##mara +laszlo +##iciencies +##rti +heterosexual +##inatory +##ssion +indira +jogging +##inga +##lism +beit +dissatisfaction +malice +##ately +nedra +peeling +##rgeon +47th +stadiums +475 +vertigo +##ains +iced +restroom +##plify +##tub +illustrating +pear +##chner +##sibility +inorganic +rappers +receipts +watery +##kura +lucinda +##oulos +reintroduced +##8th +##tched +gracefully +saxons +nutritional +wastewater +rained +favourites +bedrock +fisted +hallways +likeness +upscale +##lateral +1580 +blinds +prequel +##pps +##tama +deter +humiliating +restraining +tn +vents +1659 +laundering +recess +rosary +tractors +coulter +federer +##ifiers +##plin +persistence +##quitable +geschichte +pendulum +quakers +##beam +bassett +pictorial +buffet +koln +##sitor +drills +reciprocal +shooters +##57 +##cton +##tees +converge +pip +dmitri +donnelly +yamamoto +aqua +azores +demographics +hypnotic +spitfire +suspend +wryly +roderick +##rran +sebastien +##asurable +mavericks +##fles +##200 +himalayan +prodigy +##iance +transvaal +demonstrators +handcuffs +dodged +mcnamara +sublime +1726 +crazed +##efined +##till +ivo +pondered +reconciled +shrill +sava 
+##duk +bal +cad +heresy +jaipur +goran +##nished +341 +lux +shelly +whitehall +##hre +israelis +peacekeeping +##wled +1703 +demetrius +ousted +##arians +##zos +beale +anwar +backstroke +raged +shrinking +cremated +##yck +benign +towing +wadi +darmstadt +landfill +parana +soothe +colleen +sidewalks +mayfair +tumble +hepatitis +ferrer +superstructure +##gingly +##urse +##wee +anthropological +translators +##mies +closeness +hooves +##pw +mondays +##roll +##vita +landscaping +##urized +purification +sock +thorns +thwarted +jalan +tiberius +##taka +saline +##rito +confidently +khyber +sculptors +##ij +brahms +hammersmith +inspectors +battista +fivb +fragmentation +hackney +##uls +arresting +exercising +antoinette +bedfordshire +##zily +dyed +##hema +1656 +racetrack +variability +##tique +1655 +austrians +deteriorating +madman +theorists +aix +lehman +weathered +1731 +decreed +eruptions +1729 +flaw +quinlan +sorbonne +flutes +nunez +1711 +adored +downwards +fable +rasped +1712 +moritz +mouthful +renegade +shivers +stunts +dysfunction +restrain +translit +327 +pancakes +##avio +##cision +##tray +351 +vial +##lden +bain +##maid +##oxide +chihuahua +malacca +vimes +##rba +##rnier +1664 +donnie +plaques +##ually +337 +bangs +floppy +huntsville +loretta +nikolay +##otte +eater +handgun +ubiquitous +##hett +eras +zodiac +1634 +##omorphic +1820s +##zog +cochran +##bula +##lithic +warring +##rada +dalai +excused +blazers +mcconnell +reeling +bot +este +##abi +geese +hoax +taxon +##bla +guitarists +##icon +condemning +hunts +inversion +moffat +taekwondo +##lvis +1624 +stammered +##rest +##rzy +sousa +fundraiser +marylebone +navigable +uptown +cabbage +daniela +salman +shitty +whimper +##kian +##utive +programmers +protections +rm +##rmi +##rued +forceful +##enes +fuss +##tao +##wash +brat +oppressive +reykjavik +spartak +ticking +##inkles +##kiewicz +adolph +horst +maui +protege +straighten +cpc +landau +concourse +clements +resultant +##ando +imaginative +joo +reactivated +##rem +##ffled +##uising +consultative +##guide +flop +kaitlyn +mergers +parenting +somber +##vron +supervise +vidhan +##imum +courtship +exemplified +harmonies +medallist +refining +##rrow +##ка +amara +##hum +780 +goalscorer +sited +overshadowed +rohan +displeasure +secretive +multiplied +osman +##orth +engravings +padre +##kali +##veda +miniatures +mis +##yala +clap +pali +rook +##cana +1692 +57th +antennae +astro +oskar +1628 +bulldog +crotch +hackett +yucatan +##sure +amplifiers +brno +ferrara +migrating +##gree +thanking +turing +##eza +mccann +ting +andersson +onslaught +gaines +ganga +incense +standardization +##mation +sentai +scuba +stuffing +turquoise +waivers +alloys +##vitt +regaining +vaults +##clops +##gizing +digger +furry +memorabilia +probing +##iad +payton +rec +deutschland +filippo +opaque +seamen +zenith +afrikaans +##filtration +disciplined +inspirational +##merie +banco +confuse +grafton +tod +##dgets +championed +simi +anomaly +biplane +##ceptive +electrode +##para +1697 +cleavage +crossbow +swirl +informant +##lars +##osta +afi +bonfire +spec +##oux +lakeside +slump +##culus +##lais +##qvist +##rrigan +1016 +facades +borg +inwardly +cervical +xl +pointedly +050 +stabilization +##odon +chests +1699 +hacked +ctv +orthogonal +suzy +##lastic +gaulle +jacobite +rearview +##cam +##erted +ashby +##drik +##igate +##mise +##zbek +affectionately +canine +disperse +latham +##istles +##ivar +spielberg +##orin +##idium +ezekiel +cid +##sg +durga +middletown +##cina +customized +frontiers +harden +##etano +##zzy +1604 
+bolsheviks +##66 +coloration +yoko +##bedo +briefs +slabs +debra +liquidation +plumage +##oin +blossoms +dementia +subsidy +1611 +proctor +relational +jerseys +parochial +ter +##ici +esa +peshawar +cavalier +loren +cpi +idiots +shamrock +1646 +dutton +malabar +mustache +##endez +##ocytes +referencing +terminates +marche +yarmouth +##sop +acton +mated +seton +subtly +baptised +beige +extremes +jolted +kristina +telecast +##actic +safeguard +waldo +##baldi +##bular +endeavors +sloppy +subterranean +##ensburg +##itung +delicately +pigment +tq +##scu +1626 +##ound +collisions +coveted +herds +##personal +##meister +##nberger +chopra +##ricting +abnormalities +defective +galician +lucie +##dilly +alligator +likened +##genase +burundi +clears +complexion +derelict +deafening +diablo +fingered +champaign +dogg +enlist +isotope +labeling +mrna +##erre +brilliance +marvelous +##ayo +1652 +crawley +ether +footed +dwellers +deserts +hamish +rubs +warlock +skimmed +##lizer +870 +buick +embark +heraldic +irregularities +##ajan +kiara +##kulam +##ieg +antigen +kowalski +##lge +oakley +visitation +##mbit +vt +##suit +1570 +murderers +##miento +##rites +chimneys +##sling +condemn +custer +exchequer +havre +##ghi +fluctuations +##rations +dfb +hendricks +vaccines +##tarian +nietzsche +biking +juicy +##duced +brooding +scrolling +selangor +##ragan +352 +annum +boomed +seminole +sugarcane +##dna +departmental +dismissing +innsbruck +arteries +ashok +batavia +daze +kun +overtook +##rga +##tlan +beheaded +gaddafi +holm +electronically +faulty +galilee +fractures +kobayashi +##lized +gunmen +magma +aramaic +mala +eastenders +inference +messengers +bf +##qu +407 +bathrooms +##vere +1658 +flashbacks +ideally +misunderstood +##jali +##weather +mendez +##grounds +505 +uncanny +##iii +1709 +friendships +##nbc +sacrament +accommodated +reiterated +logistical +pebbles +thumped +##escence +administering +decrees +drafts +##flight +##cased +##tula +futuristic +picket +intimidation +winthrop +##fahan +interfered +339 +afar +francoise +morally +uta +cochin +croft +dwarfs +##bruck +##dents +##nami +biker +##hner +##meral +nano +##isen +##ometric +##pres +##ан +brightened +meek +parcels +securely +gunners +##jhl +##zko +agile +hysteria +##lten +##rcus +bukit +champs +chevy +cuckoo +leith +sadler +theologians +welded +##section +1663 +jj +plurality +xander +##rooms +##formed +shredded +temps +intimately +pau +tormented +##lok +##stellar +1618 +charred +ems +essen +##mmel +alarms +spraying +ascot +blooms +twinkle +##abia +##apes +internment +obsidian +##chaft +snoop +##dav +##ooping +malibu +##tension +quiver +##itia +hays +mcintosh +travers +walsall +##ffie +1623 +beverley +schwarz +plunging +structurally +m3 +rosenthal +vikram +##tsk +770 +ghz +##onda +##tiv +chalmers +groningen +pew +reckon +unicef +##rvis +55th +##gni +1651 +sulawesi +avila +cai +metaphysical +screwing +turbulence +##mberg +augusto +samba +56th +baffled +momentary +toxin +##urian +##wani +aachen +condoms +dali +steppe +##3d +##app +##oed +##year +adolescence +dauphin +electrically +inaccessible +microscopy +nikita +##ega +atv +##cel +##enter +##oles +##oteric +##ы +accountants +punishments +wrongly +bribes +adventurous +clinch +flinders +southland +##hem +##kata +gough +##ciency +lads +soared +##ה +undergoes +deformation +outlawed +rubbish +##arus +##mussen +##nidae +##rzburg +arcs +##ingdon +##tituted +1695 +wheelbase +wheeling +bombardier +campground +zebra +##lices +##oj +##bain +lullaby +##ecure +donetsk +wylie +grenada +##arding +##ης +squinting 
+eireann +opposes +##andra +maximal +runes +##broken +##cuting +##iface +##ror +##rosis +additive +britney +adultery +triggering +##drome +detrimental +aarhus +containment +jc +swapped +vichy +##ioms +madly +##oric +##rag +brant +##ckey +##trix +1560 +1612 +broughton +rustling +##stems +##uder +asbestos +mentoring +##nivorous +finley +leaps +##isan +apical +pry +slits +substitutes +##dict +intuitive +fantasia +insistent +unreasonable +##igen +##vna +domed +hannover +margot +ponder +##zziness +impromptu +jian +lc +rampage +stemming +##eft +andrey +gerais +whichever +amnesia +appropriated +anzac +clicks +modifying +ultimatum +cambrian +maids +verve +yellowstone +##mbs +conservatoire +##scribe +adherence +dinners +spectra +imperfect +mysteriously +sidekick +tatar +tuba +##aks +##ifolia +distrust +##athan +##zle +c2 +ronin +zac +##pse +celaena +instrumentalist +scents +skopje +##mbling +comical +compensated +vidal +condor +intersect +jingle +wavelengths +##urrent +mcqueen +##izzly +carp +weasel +422 +kanye +militias +postdoctoral +eugen +gunslinger +##ɛ +faux +hospice +##for +appalled +derivation +dwarves +##elis +dilapidated +##folk +astoria +philology +##lwyn +##otho +##saka +inducing +philanthropy +##bf +##itative +geek +markedly +sql +##yce +bessie +indices +rn +##flict +495 +frowns +resolving +weightlifting +tugs +cleric +contentious +1653 +mania +rms +##miya +##reate +##ruck +##tucket +bien +eels +marek +##ayton +##cence +discreet +unofficially +##ife +leaks +##bber +1705 +332 +dung +compressor +hillsborough +pandit +shillings +distal +##skin +381 +##tat +##you +nosed +##nir +mangrove +undeveloped +##idia +textures +##inho +##500 +##rise +ae +irritating +nay +amazingly +bancroft +apologetic +compassionate +kata +symphonies +##lovic +airspace +##lch +930 +gifford +precautions +fulfillment +sevilla +vulgar +martinique +##urities +looting +piccolo +tidy +##dermott +quadrant +armchair +incomes +mathematicians +stampede +nilsson +##inking +##scan +foo +quarterfinal +##ostal +shang +shouldered +squirrels +##owe +344 +vinegar +##bner +##rchy +##systems +delaying +##trics +ars +dwyer +rhapsody +sponsoring +##gration +bipolar +cinder +starters +##olio +##urst +421 +signage +##nty +aground +figurative +mons +acquaintances +duets +erroneously +soyuz +elliptic +recreated +##cultural +##quette +##ssed +##tma +##zcz +moderator +scares +##itaire +##stones +##udence +juniper +sighting +##just +##nsen +britten +calabria +ry +bop +cramer +forsyth +stillness +##л +airmen +gathers +unfit +##umber +##upt +taunting +##rip +seeker +streamlined +##bution +holster +schumann +tread +vox +##gano +##onzo +strive +dil +reforming +covent +newbury +predicting +##orro +decorate +tre +##puted +andover +ie +asahi +dept +dunkirk +gills +##tori +buren +huskies +##stis +##stov +abstracts +bets +loosen +##opa +1682 +yearning +##glio +##sir +berman +effortlessly +enamel +napoli +persist +##peration +##uez +attache +elisa +b1 +invitations +##kic +accelerating +reindeer +boardwalk +clutches +nelly +polka +starbucks +##kei +adamant +huey +lough +unbroken +adventurer +embroidery +inspecting +stanza +##ducted +naia +taluka +##pone +##roids +chases +deprivation +florian +##jing +##ppet +earthly +##lib +##ssee +colossal +foreigner +vet +freaks +patrice +rosewood +triassic +upstate +##pkins +dominates +ata +chants +ks +vo +##400 +##bley +##raya +##rmed +555 +agra +infiltrate +##ailing +##ilation +##tzer +##uppe +##werk +binoculars +enthusiast +fujian +squeak +##avs +abolitionist +almeida +boredom +hampstead +marsden +rations +##ands 
+inflated +334 +bonuses +rosalie +patna +##rco +329 +detachments +penitentiary +54th +flourishing +woolf +##dion +##etched +papyrus +##lster +##nsor +##toy +bobbed +dismounted +endelle +inhuman +motorola +tbs +wince +wreath +##ticus +hideout +inspections +sanjay +disgrace +infused +pudding +stalks +##urbed +arsenic +leases +##hyl +##rrard +collarbone +##waite +##wil +dowry +##bant +##edance +genealogical +nitrate +salamanca +scandals +thyroid +necessitated +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##+ +##, +##- +##. +##/ +##: +##; +##< +##= +##> +##? +##@ +##[ +##\ +##] +##^ +##_ +##` +##{ +##| +##} +##~ +##¡ +##¢ +##£ +##¤ +##¥ +##¦ +##§ +##¨ +##© +##ª +##« +##¬ +##® +##± +##´ +##µ +##¶ +##· +##º +##» +##¼ +##¾ +##¿ +##æ +##ð +##÷ +##þ +##đ +##ħ +##ŋ +##œ +##ƒ +##ɐ +##ɑ +##ɒ +##ɔ +##ɕ +##ə +##ɡ +##ɣ +##ɨ +##ɪ +##ɫ +##ɬ +##ɯ +##ɲ +##ɴ +##ɹ +##ɾ +##ʀ +##ʁ +##ʂ +##ʃ +##ʉ +##ʊ +##ʋ +##ʌ +##ʎ +##ʐ +##ʑ +##ʒ +##ʔ +##ʰ +##ʲ +##ʳ +##ʷ +##ʸ +##ʻ +##ʼ +##ʾ +##ʿ +##ˈ +##ˡ +##ˢ +##ˣ +##ˤ +##β +##γ +##δ +##ε +##ζ +##θ +##κ +##λ +##μ +##ξ +##ο +##π +##ρ +##σ +##τ +##υ +##φ +##χ +##ψ +##ω +##б +##г +##д +##ж +##з +##м +##п +##с +##у +##ф +##х +##ц +##ч +##ш +##щ +##ъ +##э +##ю +##ђ +##є +##і +##ј +##љ +##њ +##ћ +##ӏ +##ա +##բ +##գ +##դ +##ե +##թ +##ի +##լ +##կ +##հ +##մ +##յ +##ն +##ո +##պ +##ս +##վ +##տ +##ր +##ւ +##ք +##־ +##א +##ב +##ג +##ד +##ו +##ז +##ח +##ט +##י +##ך +##כ +##ל +##ם +##מ +##ן +##נ +##ס +##ע +##ף +##פ +##ץ +##צ +##ק +##ר +##ש +##ת +##، +##ء +##ب +##ت +##ث +##ج +##ح +##خ +##ذ +##ز +##س +##ش +##ص +##ض +##ط +##ظ +##ع +##غ +##ـ +##ف +##ق +##ك +##و +##ى +##ٹ +##پ +##چ +##ک +##گ +##ں +##ھ +##ہ +##ے +##अ +##आ +##उ +##ए +##क +##ख +##ग +##च +##ज +##ट +##ड +##ण +##त +##थ +##द +##ध +##न +##प +##ब +##भ +##म +##य +##र +##ल +##व +##श +##ष +##स +##ह +##ा +##ि +##ी +##ो +##। +##॥ +##ং +##অ +##আ +##ই +##উ +##এ +##ও +##ক +##খ +##গ +##চ +##ছ +##জ +##ট +##ড +##ণ +##ত +##থ +##দ +##ধ +##ন +##প +##ব +##ভ +##ম +##য +##র +##ল +##শ +##ষ +##স +##হ +##া +##ি +##ী +##ে +##க +##ச +##ட +##த +##ந +##ன +##ப +##ம +##ய +##ர +##ல +##ள +##வ +##ா +##ி +##ு +##ே +##ை +##ನ +##ರ +##ಾ +##ක +##ය +##ර +##ල +##ව +##ා +##ก +##ง +##ต +##ท +##น +##พ +##ม +##ย +##ร +##ล +##ว +##ส +##อ +##า +##เ +##་ +##། +##ག +##ང +##ད +##ན +##པ +##བ +##མ +##འ +##ར +##ལ +##ས +##မ +##ა +##ბ +##გ +##დ +##ე +##ვ +##თ +##ი +##კ +##ლ +##მ +##ნ +##ო +##რ +##ს +##ტ +##უ +##ᄀ +##ᄂ +##ᄃ +##ᄅ +##ᄆ +##ᄇ +##ᄉ +##ᄊ +##ᄋ +##ᄌ +##ᄎ +##ᄏ +##ᄐ +##ᄑ +##ᄒ +##ᅡ +##ᅢ +##ᅥ +##ᅦ +##ᅧ +##ᅩ +##ᅪ +##ᅭ +##ᅮ +##ᅯ +##ᅲ +##ᅳ +##ᅴ +##ᅵ +##ᆨ +##ᆫ +##ᆯ +##ᆷ +##ᆸ +##ᆼ +##ᴬ +##ᴮ +##ᴰ +##ᴵ +##ᴺ +##ᵀ +##ᵃ +##ᵇ +##ᵈ +##ᵉ +##ᵍ +##ᵏ +##ᵐ +##ᵒ +##ᵖ +##ᵗ +##ᵘ +##ᵣ +##ᵤ +##ᵥ +##ᶜ +##ᶠ +##‐ +##‑ +##‒ +##– +##— +##― +##‖ +##‘ +##’ +##‚ +##“ +##” +##„ +##† +##‡ +##• +##… +##‰ +##′ +##″ +##› +##‿ +##⁄ +##⁰ +##ⁱ +##⁴ +##⁵ +##⁶ +##⁷ +##⁸ +##⁹ +##⁻ +##ⁿ +##₅ +##₆ +##₇ +##₈ +##₉ +##₊ +##₍ +##₎ +##ₐ +##ₑ +##ₒ +##ₓ +##ₕ +##ₖ +##ₗ +##ₘ +##ₚ +##ₛ +##ₜ +##₤ +##₩ +##€ +##₱ +##₹ +##ℓ +##№ +##ℝ +##™ +##⅓ +##⅔ +##← +##↑ +##→ +##↓ +##↔ +##↦ +##⇄ +##⇌ +##⇒ +##∂ +##∅ +##∆ +##∇ +##∈ +##∗ +##∘ +##√ +##∞ +##∧ +##∨ +##∩ +##∪ +##≈ +##≡ +##≤ +##≥ +##⊂ +##⊆ +##⊕ +##⊗ +##⋅ +##─ +##│ +##■ +##▪ +##● +##★ +##☆ +##☉ +##♠ +##♣ +##♥ +##♦ +##♯ +##⟨ +##⟩ +##ⱼ +##⺩ +##⺼ +##⽥ +##、 +##。 +##〈 +##〉 +##《 +##》 +##「 +##」 +##『 +##』 +##〜 +##あ +##い +##う +##え +##お +##か +##き +##く +##け +##こ +##さ +##し +##す +##せ +##そ +##た +##ち +##っ +##つ +##て +##と +##な +##に +##ぬ +##ね +##の +##は +##ひ +##ふ +##へ +##ほ +##ま +##み +##む +##め +##も +##や +##ゆ +##よ +##ら +##り +##る +##れ +##ろ +##を +##ん +##ァ +##ア +##ィ +##イ +##ウ +##ェ +##エ +##オ +##カ +##キ +##ク +##ケ 
+##コ +##サ +##シ +##ス +##セ +##タ +##チ +##ッ +##ツ +##テ +##ト +##ナ +##ニ +##ノ +##ハ +##ヒ +##フ +##ヘ +##ホ +##マ +##ミ +##ム +##メ +##モ +##ャ +##ュ +##ョ +##ラ +##リ +##ル +##レ +##ロ +##ワ +##ン +##・ +##ー +##一 +##三 +##上 +##下 +##不 +##世 +##中 +##主 +##久 +##之 +##也 +##事 +##二 +##五 +##井 +##京 +##人 +##亻 +##仁 +##介 +##代 +##仮 +##伊 +##会 +##佐 +##侍 +##保 +##信 +##健 +##元 +##光 +##八 +##公 +##内 +##出 +##分 +##前 +##劉 +##力 +##加 +##勝 +##北 +##区 +##十 +##千 +##南 +##博 +##原 +##口 +##古 +##史 +##司 +##合 +##吉 +##同 +##名 +##和 +##囗 +##四 +##国 +##國 +##土 +##地 +##坂 +##城 +##堂 +##場 +##士 +##夏 +##外 +##大 +##天 +##太 +##夫 +##奈 +##女 +##子 +##学 +##宀 +##宇 +##安 +##宗 +##定 +##宣 +##宮 +##家 +##宿 +##寺 +##將 +##小 +##尚 +##山 +##岡 +##島 +##崎 +##川 +##州 +##巿 +##帝 +##平 +##年 +##幸 +##广 +##弘 +##張 +##彳 +##後 +##御 +##德 +##心 +##忄 +##志 +##忠 +##愛 +##成 +##我 +##戦 +##戸 +##手 +##扌 +##政 +##文 +##新 +##方 +##日 +##明 +##星 +##春 +##昭 +##智 +##曲 +##書 +##月 +##有 +##朝 +##木 +##本 +##李 +##村 +##東 +##松 +##林 +##森 +##楊 +##樹 +##橋 +##歌 +##止 +##正 +##武 +##比 +##氏 +##民 +##水 +##氵 +##氷 +##永 +##江 +##沢 +##河 +##治 +##法 +##海 +##清 +##漢 +##瀬 +##火 +##版 +##犬 +##王 +##生 +##田 +##男 +##疒 +##発 +##白 +##的 +##皇 +##目 +##相 +##省 +##真 +##石 +##示 +##社 +##神 +##福 +##禾 +##秀 +##秋 +##空 +##立 +##章 +##竹 +##糹 +##美 +##義 +##耳 +##良 +##艹 +##花 +##英 +##華 +##葉 +##藤 +##行 +##街 +##西 +##見 +##訁 +##語 +##谷 +##貝 +##貴 +##車 +##軍 +##辶 +##道 +##郎 +##郡 +##部 +##都 +##里 +##野 +##金 +##鈴 +##镇 +##長 +##門 +##間 +##阝 +##阿 +##陳 +##陽 +##雄 +##青 +##面 +##風 +##食 +##香 +##馬 +##高 +##龍 +##龸 +##fi +##fl +##! +##( +##) +##, +##- +##. +##/ +##: +##? +##~ diff --git a/modelzoo/LanguageModeling/BERT/.dockerignore b/modelzoo/LanguageModeling/BERT/.dockerignore new file mode 100644 index 00000000..c70b78d6 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/.dockerignore @@ -0,0 +1,27 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +data_dl/ +.idea/ +.git/ +.vscode/ +__pycache__/ +results/ +data/binary +data/download +data/extracted +data/formatted_one_article_per_line +data/sharded +data/hdf5* +data/tfrecord* +checkpoints/ diff --git a/modelzoo/LanguageModeling/BERT/.gitignore b/modelzoo/LanguageModeling/BERT/.gitignore new file mode 100644 index 00000000..61c2b075 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/.gitignore @@ -0,0 +1,147 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +#Data +data/download +data/extracted +data/formatted_one_article_per_line +data/sharded +data/hdf5* +data/tfrecord* +data/*/*.zip + +#Resutls +results/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +.vscode/ +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# TensorRT +*.engine +models/ diff --git a/modelzoo/LanguageModeling/BERT/Bert_result.md b/modelzoo/LanguageModeling/BERT/Bert_result.md deleted file mode 100644 index 722e9f80..00000000 --- a/modelzoo/LanguageModeling/BERT/Bert_result.md +++ /dev/null @@ -1,26 +0,0 @@ -| Parameters | A100 | A800 | -| --------------------- | ----------------------- | ----------------------- | -| DataSet | SQuAD1.1 | SQuAD1.1 | -| num_hidden_layers | 12 | 12 | -| batch_size_per_gpu | 32 | 32 | -| learning_rate_per_gpu | 5e-6 | 5e-6 | -| precision | fp16 | Fp16 | -| use_xla | true | true | -| num_gpus | 8 | 8 | -| max_seq_length | 384 | 384 | -| doc_stride | 128 | 128 | -| epochs | 1 | 1 | -| checkpoint | uncased_L-12_H-768_A-12 | uncased_L-12_H-768_A-12 | - -| Task | total_training_steps | train_loss | F1 | exact_match | Throughput Average (sentences/sec)! | Training Duration sec | GPU Util | GPU Memory-Usage(MB)! | -| -------------- | ---------- | ----------------------------------- | --------------------- | -------- | --------------------- | --------------------- | --------------------- | --------------------- | -| A800_GPU-8_bs-12_LR-5e-6_fp16_XLA-true_BERT-base_SQuAD1.1_Epoch-1 | | | 85.2391 | 76.6982 | 280.94 | 1011.48 | 8 * 96% | | -| | | | | | | | | | -| | | | | | | | | | -| | | | | | | | | | -| | | | | | | | | | -| TITAN_GPU-4_bs-12_LR-5e-6_fp16_XLA-true_BERT-base_SQuAD1.1_Epoch-1 | 1846 | 1.0677543878555298 | 84.4242 | 75.4494 | 286.32 | 511.30 for Examples = 88608 | 8 * 96% | | - -*Memory(GiB): Max consumption - -*CPU Util: Max moment value diff --git a/modelzoo/LanguageModeling/BERT/Dockerfile b/modelzoo/LanguageModeling/BERT/Dockerfile new file mode 100644 index 00000000..cc37ea6b --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/Dockerfile @@ -0,0 +1,55 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:21.02-tf2-py3 +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl + +WORKDIR /workspace +ENV HOME /workspace + +WORKDIR /workspace +RUN git clone https://github.com/openai/gradient-checkpointing.git +RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. +RUN git clone https://github.com/soskek/bookcorpus.git +RUN git clone https://github.com/titipata/pubmed_parser + +RUN pip3 install /workspace/pubmed_parser + +# Environment +ENV LANG C.UTF-8 +ENV LC_ALL C.UTF-8 + +# Install Python 3 packages +RUN pip3 install \ + requests \ + tqdm \ + horovod \ + sentencepiece \ + tensorflow_hub \ + pynvml \ + wget \ + progressbar \ + git+https://github.com/NVIDIA/dllogger + +WORKDIR /workspace/bert_tf2 +# Copy model into image - This can be overridden by mounting a volume to the same location. +COPY . . +ENV PYTHONPATH="/workspace/wikiextractor:/workspace/bert_tf2:${PYTHONPATH}" + +#disable lazy compilatoin +ENV TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false" + +ENV TF_DEVICE_MIN_SYS_MEMORY_IN_MB=2048 diff --git a/modelzoo/LanguageModeling/BERT/README.md b/modelzoo/LanguageModeling/BERT/README.md index 6684b4d5..f5e58033 100644 --- a/modelzoo/LanguageModeling/BERT/README.md +++ b/modelzoo/LanguageModeling/BERT/README.md @@ -150,7 +150,7 @@ For information about: #### Enabling mixed precision -This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--dtype=fp16` flag to the `run_pretraining.py` or `run_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: +This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--use_fp16` flag to the `run_pretraining.py` or `run_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: 1. Set the Keras mixed precision policy: ```python @@ -393,7 +393,7 @@ The `official/` folder contains necessary files of building model architecture a Aside from the options to set hyperparameters, the relevant options to control the behaviour of the `run_pretraining.py` script are: ``` - --config_file: Bert configuration file to define core bert layers. + --bert_config_file: Bert configuration file to define core bert layers. --init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model). --[no]use_horovod: Whether to use horovod.(default: 'false') --[no]use_fp16: Whether to use fp32 or fp16 arithmetic on GPU. 
When false, uses TF32 on A100 and FP32 on V100 GPUS.(default: 'false') @@ -407,7 +407,7 @@ Aside from the options to set hyperparameters, the relevant options to control t Aside from the options to set hyperparameters, some relevant options to control the behaviour of the `run_squad.py` script are: ``` - --config_file: Bert configuration file to define core bert layers. + --bert_config_file: Bert configuration file to define core bert layers. --model_dir: The location of the model checkpoint files. --mode: : One of {"train_and_predict", "train", "predict", "export_only"}. `train_and_predict`: both train and predict to a json file. `train`: only trains the model. trains the model and evaluates in the meantime. `predict`: predict answers from the squad json file. `export_only`: will take the latest checkpoint inside model_dir and export a `SavedModel`. --max_answer_length: The maximum length of an answer that can be generated. (default: '30')(an integer) @@ -569,7 +569,7 @@ mpirun -np 8 \ -x LD_LIBRARY_PATH \ -x PATH -mca pml ob1 -mca btl ^openib \ python run_squad.py --use_horovod --vocab_file=$BERT_DIR/vocab.txt \ - --config_file=$BERT_DIR/bert_config.json \ + --bert_config_file=$BERT_DIR/bert_config.json \ --model_dir=/results ``` diff --git a/modelzoo/LanguageModeling/BERT/bert_dllogger.json b/modelzoo/LanguageModeling/BERT/bert_dllogger.json new file mode 100644 index 00000000..13d81e4a --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/bert_dllogger.json @@ -0,0 +1,15 @@ +DLLL {"timestamp": "1750063068.327787", "elapsedtime": "7e-06", "datetime": "2025-06-16 08:37:48.327787+00:00", "type": "METADATA", "metric": "mlm_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327887", "elapsedtime": "0.000107", "datetime": "2025-06-16 08:37:48.327887+00:00", "type": "METADATA", "metric": "nsp_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327918", "elapsedtime": "0.000138", "datetime": "2025-06-16 08:37:48.327918+00:00", "type": "METADATA", "metric": "avg_loss_step", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327944", "elapsedtime": "0.000164", "datetime": "2025-06-16 08:37:48.327944+00:00", "type": "METADATA", "metric": "total_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327964", "elapsedtime": "0.000184", "datetime": "2025-06-16 08:37:48.327964+00:00", "type": "METADATA", "metric": "loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327982", "elapsedtime": "0.000202", "datetime": "2025-06-16 08:37:48.327982+00:00", "type": "METADATA", "metric": "f1", "metadata": {"unit": null, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328002", "elapsedtime": "0.000222", "datetime": "2025-06-16 08:37:48.328002+00:00", "type": "METADATA", "metric": "precision", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.32802", "elapsedtime": "0.00024", "datetime": "2025-06-16 08:37:48.328020+00:00", "type": "METADATA", "metric": "recall", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328037", "elapsedtime": "0.000257", "datetime": "2025-06-16 08:37:48.328037+00:00", "type": "METADATA", "metric": "mcc", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL 
{"timestamp": "1750063068.328056", "elapsedtime": "0.000276", "datetime": "2025-06-16 08:37:48.328056+00:00", "type": "METADATA", "metric": "exact_match", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328073", "elapsedtime": "0.000293", "datetime": "2025-06-16 08:37:48.328073+00:00", "type": "METADATA", "metric": "throughput_train", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.328092", "elapsedtime": "0.000312", "datetime": "2025-06-16 08:37:48.328092+00:00", "type": "METADATA", "metric": "throughput_inf", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328108", "elapsedtime": "0.000328", "datetime": "2025-06-16 08:37:48.328108+00:00", "type": "METADATA", "metric": "throughput_val", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063224.077653", "datetime": "2025-06-16 08:40:24.077653+00:00", "elapsedtime": "155.749873", "type": "LOG", "step": [], "data": {"throughput_train": 54.50596785734736}} +DLLL {"timestamp": "1750063224.077839", "datetime": "2025-06-16 08:40:24.077839+00:00", "elapsedtime": "155.750059", "type": "LOG", "step": [], "data": {"total_loss": 1.4864305257797241}} diff --git a/modelzoo/LanguageModeling/BERT/classifier_data_lib.py b/modelzoo/LanguageModeling/BERT/classifier_data_lib.py new file mode 100644 index 00000000..891d4991 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/classifier_data_lib.py @@ -0,0 +1,581 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT library to process data for classification task.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os + +from absl import logging +import tensorflow as tf + +import tokenization + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + self.process_text_fn = process_text_fn + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @staticmethod + def get_processor_name(): + """Gets the string identifier of the processor.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.io.gfile.GFile(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + super(XnliProcessor, self).__init__(process_text_fn) + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % self.language)) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = self.process_text_fn(line[0]) + if language != self.process_text_fn(self.language): + continue + text_a = self.process_text_fn(line[6]) + text_b = self.process_text_fn(line[7]) + label = self.process_text_fn(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XNLI" + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MNLI" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, self.process_text_fn(line[0])) + text_a = self.process_text_fn(line[8]) + text_b = self.process_text_fn(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = self.process_text_fn(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MRPC" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = self.process_text_fn(line[3]) + text_b = self.process_text_fn(line[4]) + if set_type == "test": + label = "0" + else: + label = self.process_text_fn(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "COLA" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = self.process_text_fn(line[1]) + label = "0" + else: + text_a = self.process_text_fn(line[3]) + label = self.process_text_fn(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class SstProcessor(DataProcessor): + """Processor for the SST-2 data set 
(GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "SST-2" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[0]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "QNLI" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, 1) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + label = "entailment" + else: + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s", (example.guid)) + logging.info("tokens: %s", + " ".join([tokenization.printable_text(x) for x in tokens])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("label: %s (id = %d)", example.label, label_id) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features(examples, label_list, + max_seq_length, tokenizer, + output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d", ex_index, len(examples)) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple 
heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def generate_tf_record_from_data_file(processor, + data_dir, + tokenizer, + train_data_output_path=None, + eval_data_output_path=None, + max_seq_length=128): + """Generates and saves training data into a tf record file. + + Arguments: + processor: Input processor object to be used for generating data. Subclass + of `DataProcessor`. + data_dir: Directory that contains train/eval data to process. Data files + should be in from "dev.tsv", "test.tsv", or "train.tsv". + tokenizer: The tokenizer to be applied on the data. + train_data_output_path: Output to which processed tf record for training + will be saved. + eval_data_output_path: Output to which processed tf record for evaluation + will be saved. + max_seq_length: Maximum sequence length of the to be generated + training/eval data. + + Returns: + A dictionary containing input meta data. + """ + assert train_data_output_path or eval_data_output_path + + label_list = processor.get_labels() + assert train_data_output_path + train_input_data_examples = processor.get_train_examples(data_dir) + file_based_convert_examples_to_features(train_input_data_examples, label_list, + max_seq_length, tokenizer, + train_data_output_path) + num_training_data = len(train_input_data_examples) + + if eval_data_output_path: + eval_input_data_examples = processor.get_dev_examples(data_dir) + file_based_convert_examples_to_features(eval_input_data_examples, + label_list, max_seq_length, + tokenizer, eval_data_output_path) + + meta_data = { + "task_type": "bert_classification", + "processor_type": processor.get_processor_name(), + "num_labels": len(processor.get_labels()), + "train_data_size": num_training_data, + "max_seq_length": max_seq_length, + } + + if eval_data_output_path: + meta_data["eval_data_size"] = len(eval_input_data_examples) + + return meta_data diff --git a/modelzoo/LanguageModeling/BERT/common_flags.py b/modelzoo/LanguageModeling/BERT/common_flags.py new file mode 100644 index 00000000..9728288f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/common_flags.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Defining common flags used across all BERT models/applications.""" + +from absl import flags +import tensorflow as tf + +from deepray.utils.flags import core as flags_core + + +def define_common_bert_flags(): + """Define common flags for BERT tasks.""" + flags.DEFINE_string('bert_config_file', None, + 'Bert configuration file to define core bert layers.') + flags.DEFINE_string( + 'model_export_path', None, + 'Path to the directory, where trainined model will be ' + 'exported.') + flags.DEFINE_string('tpu', '', 'TPU address to connect to.') + flags.DEFINE_integer('num_train_epochs', 3, + 'Total number of training epochs to perform.') + flags.DEFINE_integer( + 'steps_per_loop', 200, + 'Number of steps per graph-mode loop. Only training step ' + 'happens inside the loop. Callbacks will not be called ' + 'inside.') + flags.DEFINE_boolean( + 'scale_loss', False, + 'Whether to divide the loss by number of replica inside the per-replica ' + 'loss function.') + flags.DEFINE_boolean( + 'use_keras_compile_fit', False, + 'If True, uses Keras compile/fit() API for training logic. Otherwise ' + 'use custom training loop.') + flags.DEFINE_string( + 'hub_module_url', None, 'TF-Hub path/url to Bert module. ' + 'If specified, init_checkpoint flag should not be used.') + flags.DEFINE_enum( + 'model_type', 'bert', ['bert', 'albert'], + 'Specifies the type of the model. ' + 'If "bert", will use canonical BERT; if "albert", will use ALBERT model.') + flags.DEFINE_boolean( + 'use_fp16', False, + 'Whether to use fp32 or fp16 arithmetic on GPU.') + flags.DEFINE_integer( + 'save_checkpoint_steps', 1000, + 'save checkpoint for every n steps') + flags.DEFINE_string( + 'dllog_path', 'bert_dllogger.json', 'filename where dllogger writes to') + flags.DEFINE_boolean( + 'benchmark', False, + 'Benchmark mode.') + + +def use_float16(): + return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16 + + +def get_loss_scale(): + return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic') diff --git a/modelzoo/LanguageModeling/BERT/create_finetuning_data.py b/modelzoo/LanguageModeling/BERT/create_finetuning_data.py new file mode 100644 index 00000000..5bfeff55 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/create_finetuning_data.py @@ -0,0 +1,184 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""BERT finetuning task dataset generator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import json + +from absl import app +from absl import flags +import tensorflow as tf + +import classifier_data_lib +# word-piece tokenizer based squad_lib +import squad_lib as squad_lib_wp +# sentence-piece tokenizer based squad_lib +import squad_lib_sp +import tokenization + +FLAGS = flags.FLAGS + +flags.DEFINE_enum( + "fine_tuning_task_type", "classification", ["classification", "squad"], + "The name of the BERT fine tuning task for which data " + "will be generated..") + +# BERT classification specific flags. +flags.DEFINE_string( + "input_data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_enum("classification_task_name", "MNLI", + ["COLA", "MNLI", "MRPC", "QNLI", "SST-2", "XNLI"], + "The name of the task to train BERT classifier.") + +# BERT Squad task specific flags. +flags.DEFINE_string( + "squad_data_file", None, + "The input data file in for generating training data for BERT squad task.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +# Shared flags across BERT fine-tuning tasks. +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "train_data_output_path", None, + "The path in which generated training input data will be written as tf" + " records.") + +flags.DEFINE_string( + "eval_data_output_path", None, + "The path in which generated training input data will be written as tf" + " records.") + +flags.DEFINE_string("meta_data_file_path", None, + "The path in which input meta data will be written.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string("sp_model_file", "", + "The path to the model used by sentence piece tokenizer.") + +flags.DEFINE_enum( + "tokenizer_impl", "word_piece", ["word_piece", "sentence_piece"], + "Specifies the tokenizer implementation, i.e., whehter to use word_piece " + "or sentence_piece tokenizer. 
Canonical BERT uses word_piece tokenizer, " + "while ALBERT uses sentence_piece tokenizer.") + + +def generate_classifier_dataset(): + """Generates classifier dataset and returns input meta data.""" + assert FLAGS.input_data_dir and FLAGS.classification_task_name + + processors = { + "cola": classifier_data_lib.ColaProcessor, + "mnli": classifier_data_lib.MnliProcessor, + "mrpc": classifier_data_lib.MrpcProcessor, + "qnli": classifier_data_lib.QnliProcessor, + "sst-2": classifier_data_lib.SstProcessor, + "xnli": classifier_data_lib.XnliProcessor, + } + task_name = FLAGS.classification_task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + if FLAGS.tokenizer_impl == "word_piece": + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + processor_text_fn = tokenization.convert_to_unicode + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) + processor_text_fn = functools.partial( + tokenization.preprocess_text, lower=FLAGS.do_lower_case) + + processor = processors[task_name](processor_text_fn) + return classifier_data_lib.generate_tf_record_from_data_file( + processor, + FLAGS.input_data_dir, + tokenizer, + train_data_output_path=FLAGS.train_data_output_path, + eval_data_output_path=FLAGS.eval_data_output_path, + max_seq_length=FLAGS.max_seq_length) + + +def generate_squad_dataset(): + """Generates squad training dataset and returns input meta data.""" + assert FLAGS.squad_data_file + if FLAGS.tokenizer_impl == "word_piece": + return squad_lib_wp.generate_tf_record_from_json_file( + FLAGS.squad_data_file, FLAGS.vocab_file, FLAGS.train_data_output_path, + FLAGS.max_seq_length, FLAGS.do_lower_case, FLAGS.max_query_length, + FLAGS.doc_stride, FLAGS.version_2_with_negative) + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + return squad_lib_sp.generate_tf_record_from_json_file( + FLAGS.squad_data_file, FLAGS.sp_model_file, + FLAGS.train_data_output_path, FLAGS.max_seq_length, FLAGS.do_lower_case, + FLAGS.max_query_length, FLAGS.doc_stride, FLAGS.version_2_with_negative) + + +def main(_): + if FLAGS.tokenizer_impl == "word_piece": + if not FLAGS.vocab_file: + raise ValueError( + "FLAG vocab_file for word-piece tokenizer is not specified.") + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + if not FLAGS.sp_model_file: + raise ValueError( + "FLAG sp_model_file for sentence-piece tokenizer is not specified.") + + if FLAGS.fine_tuning_task_type == "classification": + input_meta_data = generate_classifier_dataset() + else: + input_meta_data = generate_squad_dataset() + + with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer: + writer.write(json.dumps(input_meta_data, indent=4) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("train_data_output_path") + flags.mark_flag_as_required("meta_data_file_path") + app.run(main) diff --git a/modelzoo/LanguageModeling/BERT/create_pretraining_data.py b/modelzoo/LanguageModeling/BERT/create_pretraining_data.py new file mode 100644 index 00000000..69a5c696 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/create_pretraining_data.py @@ -0,0 +1,655 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +import collections +import itertools +import random + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf + +import tokenization + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer( + "max_ngram_size", None, + "Mask contiguous whole words (n-grams) of up to `max_ngram_size` using a " + "weighting scheme to favor shorter n-grams. " + "Note: `--do_whole_word_mask=True` must also be set when n-gram masking.") + +flags.DEFINE_bool( + "gzip_compress", False, + "Whether to use `GZIP` compress option to get compressed TFRecord files.") + +flags.DEFINE_bool( + "use_v2_feature_names", False, + "Whether to use the feature names consistent with the models.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files, + gzip_compress, 
use_v2_feature_names): + """Creates TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append( + tf.io.TFRecordWriter( + output_file, options="GZIP" if gzip_compress else "")) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + if use_v2_feature_names: + features["input_word_ids"] = create_int_feature(input_ids) + features["input_type_ids"] = create_int_feature(segment_ids) + else: + features["input_ids"] = create_int_feature(input_ids) + features["segment_ids"] = create_int_feature(segment_ids) + + features["input_mask"] = create_int_feature(input_mask) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + logging.info("*** Example ***") + logging.info("tokens: %s", " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + logging.info("%s: %s", feature_name, " ".join([str(x) for x in values])) + + for writer in writers: + writer.close() + + logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, + tokenizer, + max_seq_length, + dupe_factor, + short_seq_prob, + masked_lm_prob, + max_predictions_per_seq, + rng, + do_whole_word_mask=False, + max_ngram_size=None): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
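+  #
+  # A minimal, purely illustrative input file in this format might look like:
+  #
+  #   The cat sat quietly on the old wooden porch.
+  #   It watched the birds gather near the feeder.
+  #
+  #   A second document starts here, after the blank delimiter line.
+  #   Each of its sentences is again on its own line.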
+ for input_file in input_files: + with tf.io.gfile.GFile(input_file, "rb") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, max_ngram_size)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask=False, + max_ngram_size=None): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. 
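+          # With 10 bounded attempts, sampling the same document again is very
+          # unlikely for a normal-sized corpus; only for a tiny corpus (e.g.
+          # two documents) can a duplicate occasionally slip through.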
+ for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, max_ngram_size) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + +# A _Gram is a [half-open) interval of token indices which form a word. +# E.g., +# words: ["The", "doghouse"] +# tokens: ["The", "dog", "##house"] +# grams: [(0,1), (1,3)] +_Gram = collections.namedtuple("_Gram", ["begin", "end"]) + + +def _window(iterable, size): + """Helper to create a sliding window iterator with a given size. + E.g., + input = [1, 2, 3, 4] + _window(input, 1) => [1], [2], [3], [4] + _window(input, 2) => [1, 2], [2, 3], [3, 4] + _window(input, 3) => [1, 2, 3], [2, 3, 4] + _window(input, 4) => [1, 2, 3, 4] + _window(input, 5) => None + Args: + iterable: elements to iterate over. + size: size of the window. + Yields: + Elements of `iterable` batched into a sliding window of length `size`. + """ + i = iter(iterable) + window = [] + try: + for e in range(0, size): + window.append(next(i)) + yield window + except StopIteration: + # handle the case where iterable's length is less than the window size. + return + for e in i: + window = window[1:] + [e] + yield window + + +def _contiguous(sorted_grams): + """Test whether a sequence of grams is contiguous. + Args: + sorted_grams: _Grams which are sorted in increasing order. + Returns: + True if `sorted_grams` are touching each other. + E.g., + _contiguous([(1, 4), (4, 5), (5, 10)]) == True + _contiguous([(1, 2), (4, 5)]) == False + """ + for a, b in _window(sorted_grams, 2): + if a.end != b.begin: + return False + return True + + +def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng): + """Create a list of masking {1, ..., n}-grams from a list of one-grams. + This is an extention of 'whole word masking' to mask multiple, contiguous + words such as (e.g., "the red boat"). 
+ Each input gram represents the token indices of a single word, + words: ["the", "red", "boat"] + tokens: ["the", "red", "boa", "##t"] + grams: [(0,1), (1,2), (2,4)] + For a `max_ngram_size` of three, possible outputs masks include: + 1-grams: (0,1), (1,2), (2,4) + 2-grams: (0,2), (1,4) + 3-grams; (0,4) + Output masks will not overlap and contain less than `max_masked_tokens` total + tokens. E.g., for the example above with `max_masked_tokens` as three, + valid outputs are, + [(0,1), (1,2)] # "the", "red" covering two tokens + [(1,2), (2,4)] # "red", "boa", "##t" covering three tokens + The length of the selected n-gram follows a zipf weighting to + favor shorter n-gram sizes (weight(1)=1, weight(2)=1/2, weight(3)=1/3, ...). + Args: + grams: List of one-grams. + max_ngram_size: Maximum number of contiguous one-grams combined to create + an n-gram. + max_masked_tokens: Maximum total number of tokens to be masked. + rng: `random.Random` generator. + Returns: + A list of n-grams to be used as masks. + """ + if not grams: + return None + + grams = sorted(grams) + num_tokens = grams[-1].end + + # Ensure our grams are valid (i.e., they don't overlap). + for a, b in _window(grams, 2): + if a.end > b.begin: + raise ValueError("overlapping grams: {}".format(grams)) + + # Build map from n-gram length to list of n-grams. + ngrams = {i: [] for i in range(1, max_ngram_size+1)} + for gram_size in range(1, max_ngram_size+1): + for g in _window(grams, gram_size): + if _contiguous(g): + # Add an n-gram which spans these one-grams. + ngrams[gram_size].append(_Gram(g[0].begin, g[-1].end)) + + # Shuffle each list of n-grams. + for v in ngrams.values(): + rng.shuffle(v) + + # Create the weighting for n-gram length selection. + # Stored cummulatively for `random.choices` below. + cummulative_weights = list( + itertools.accumulate([1./n for n in range(1, max_ngram_size+1)])) + + output_ngrams = [] + # Keep a bitmask of which tokens have been masked. + masked_tokens = [False] * num_tokens + # Loop until we have enough masked tokens or there are no more candidate + # n-grams of any length. + # Each code path should ensure one or more elements from `ngrams` are removed + # to guarentee this loop terminates. + while (sum(masked_tokens) < max_masked_tokens and + sum(len(s) for s in ngrams.values())): + # Pick an n-gram size based on our weights. + sz = random.choices(range(1, max_ngram_size+1), + cum_weights=cummulative_weights)[0] + + # Ensure this size doesn't result in too many masked tokens. + # E.g., a two-gram contains _at least_ two tokens. + if sum(masked_tokens) + sz > max_masked_tokens: + # All n-grams of this length are too long and can be removed from + # consideration. + ngrams[sz].clear() + continue + + # All of the n-grams of this size have been used. + if not ngrams[sz]: + continue + + # Choose a random n-gram of the given size. + gram = ngrams[sz].pop() + num_gram_tokens = gram.end-gram.begin + + # Check if this would add too many tokens. + if num_gram_tokens + sum(masked_tokens) > max_masked_tokens: + continue + + # Check if any of the tokens in this gram have already been masked. + if sum(masked_tokens[gram.begin:gram.end]): + continue + + # Found a usable n-gram! Mark its tokens as masked and add it to return. + masked_tokens[gram.begin:gram.end] = [True] * (gram.end-gram.begin) + output_ngrams.append(gram) + return output_ngrams + + +def _wordpieces_to_grams(tokens): + """Reconstitue grams (words) from `tokens`. 
+ E.g., + tokens: ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]'] + grams: [ [1,2), [2, 4), [4,5) , [5, 6)] + Args: + tokens: list of wordpieces + Returns: + List of _Grams representing spans of whole words + (without "[CLS]" and "[SEP]"). + """ + grams = [] + gram_start_pos = None + for i, token in enumerate(tokens): + if gram_start_pos is not None and token.startswith("##"): + continue + if gram_start_pos is not None: + grams.append(_Gram(gram_start_pos, i)) + if token not in ["[CLS]", "[SEP]"]: + gram_start_pos = i + else: + gram_start_pos = None + if gram_start_pos is not None: + grams.append(_Gram(gram_start_pos, len(tokens))) + return grams + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, + max_ngram_size=None): + """Creates the predictions for the masked LM objective.""" + if do_whole_word_mask: + grams = _wordpieces_to_grams(tokens) + else: + # Here we consider each token to be a word to allow for sub-word masking. + if max_ngram_size: + raise ValueError("cannot use ngram masking without whole word masking") + grams = [_Gram(i, i+1) for i in range(0, len(tokens)) + if tokens[i] not in ["[CLS]", "[SEP]"]] + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + # Generate masks. If `max_ngram_size` in [0, None] it means we're doing + # whole word masking or token level masking. Both of these can be treated + # as the `max_ngram_size=1` case. + masked_grams = _masking_ngrams(grams, max_ngram_size or 1, + num_to_predict, rng) + masked_lms = [] + output_tokens = list(tokens) + for gram in masked_grams: + # 80% of the time, replace all n-gram tokens with [MASK] + if rng.random() < 0.8: + replacement_action = lambda idx: "[MASK]" + else: + # 10% of the time, keep all the original n-gram tokens. + if rng.random() < 0.5: + replacement_action = lambda idx: tokens[idx] + # 10% of the time, replace each n-gram token with a random word. + else: + replacement_action = lambda idx: rng.choice(vocab_words) + + for idx in range(gram.begin, gram.end): + output_tokens[idx] = replacement_action(idx) + masked_lms.append(MaskedLmInstance(index=idx, label=tokens[idx])) + + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
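+    # Illustrative (hypothetical) walk-through: with max_num_tokens=8,
+    # len(tokens_a)=6 and len(tokens_b)=5, three tokens are removed one at a
+    # time, each taken from whichever sequence is currently longer (ties go to
+    # tokens_b), dropped from its front or back at random.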
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.io.gfile.glob(input_pattern)) + + logging.info("*** Reading from input files ***") + for input_file in input_files: + logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng, FLAGS.do_whole_word_mask, FLAGS.max_ngram_size) + + output_files = FLAGS.output_file.split(",") + logging.info("*** Writing to output files ***") + for output_file in output_files: + logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files, + FLAGS.gzip_compress, + FLAGS.use_v2_feature_names) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + app.run(main) \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py b/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py new file mode 100644 index 00000000..53ee6c43 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + +class BooksDownloader: + def __init__(self, save_path): + self.save_path = save_path + pass + + + def download(self): + bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' + bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' + bookscorpus_download_command += ' --trash-bad-count' + bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py new file mode 100644 index 00000000..22e48d4b --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py @@ -0,0 +1,32 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os + +class BookscorpusTextFormatting: + def __init__(self, books_path, output_filename, recursive = False): + self.books_path = books_path + self.recursive = recursive + self.output_filename = output_filename + + + # This puts one book per line + def merge(self): + with open(self.output_filename, mode='w', newline='\n') as ofile: + for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True): + with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file: + for line in file: + if line.strip() != '': + ofile.write(line.strip() + ' ') + ofile.write("\n\n") \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/Downloader.py b/modelzoo/LanguageModeling/BERT/data/Downloader.py new file mode 100644 index 00000000..bb5c6287 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/Downloader.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader +from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader +from WikiDownloader import WikiDownloader +from BooksDownloader import BooksDownloader +from GLUEDownloader import GLUEDownloader +from SquadDownloader import SquadDownloader +from PubMedDownloader import PubMedDownloader + +class Downloader: + def __init__(self, dataset_name, save_path): + self.dataset_name = dataset_name + self.save_path = save_path + + + def download(self): + if self.dataset_name == 'bookscorpus': + self.download_bookscorpus() + + elif self.dataset_name == 'wikicorpus_en': + self.download_wikicorpus('en') + + elif self.dataset_name == 'wikicorpus_zh': + self.download_wikicorpus('zh') + + elif self.dataset_name == 'pubmed_baseline': + self.download_pubmed('baseline') + + elif self.dataset_name == 'pubmed_daily_update': + self.download_pubmed('daily_update') + + elif self.dataset_name == 'pubmed_fulltext': + self.download_pubmed('fulltext') + + elif self.dataset_name == 'pubmed_open_access': + self.download_pubmed('open_access') + + elif self.dataset_name == 'google_pretrained_weights': + self.download_google_pretrained_weights() + + elif self.dataset_name == 'nvidia_pretrained_weights': + self.download_nvidia_pretrained_weights() + + elif self.dataset_name == 'mrpc': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'mnli': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'cola': + self.download_glue(self.dataset_name) + elif self.dataset_name == 'sst-2': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'squad': + self.download_squad() + + elif self.dataset_name == 'all': + self.download_bookscorpus() + self.download_wikicorpus('en') + self.download_wikicorpus('zh') + self.download_pubmed('baseline') + self.download_pubmed('daily_update') + self.download_pubmed('fulltext') + 
self.download_pubmed('open_access') + self.download_google_pretrained_weights() + self.download_nvidia_pretrained_weights() + self.download_glue("cola") + self.download_glue("mnli") + self.download_glue("mrpc") + self.download_glue("sst-2") + self.download_squad() + + else: + print(self.dataset_name) + assert False, 'Unknown dataset_name provided to downloader' + + + def download_bookscorpus(self): + downloader = BooksDownloader(self.save_path) + downloader.download() + + + def download_wikicorpus(self, language): + downloader = WikiDownloader(language, self.save_path) + downloader.download() + + + def download_pubmed(self, subset): + downloader = PubMedDownloader(subset, self.save_path) + downloader.download() + + + def download_google_pretrained_weights(self): + downloader = GooglePretrainedWeightDownloader(self.save_path) + downloader.download() + + + def download_nvidia_pretrained_weights(self): + downloader = NVIDIAPretrainedWeightDownloader(self.save_path) + downloader.download() + + + def download_glue(self, glue_task_name): + downloader = GLUEDownloader(self.save_path) + downloader.download(glue_task_name) + + + def download_squad(self): + downloader = SquadDownloader(self.save_path) + downloader.download() diff --git a/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py b/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py new file mode 100644 index 00000000..4c1e701f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import wget + +from pathlib import Path + + +def mkdir(path): + Path(path).mkdir(parents=True, exist_ok=True) + + +class GLUEDownloader: + + def __init__(self, save_path): + self.save_path = save_path + '/glue' + + def download(self, task_name): + mkdir(self.save_path) + if task_name in {'mrpc', 'mnli'}: + task_name = task_name.upper() + elif task_name == 'cola': + task_name = 'CoLA' + else: # SST-2 + assert task_name == 'sst-2' + task_name = 'SST' + wget.download( + 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py', + out=self.save_path, + ) + sys.path.append(self.save_path) + import download_glue_data + download_glue_data.main( + ['--data_dir', self.save_path, '--tasks', task_name]) + sys.path.pop() diff --git a/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py b/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py new file mode 100644 index 00000000..7d21f0bf --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py @@ -0,0 +1,157 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import os +import urllib.request +import tarfile + +class GooglePretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/google_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Download urls + self.model_urls = { + 'bert_base_uncased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz', 'uncased_L-12_H-768_A-12.tar.gz'), + 'bert_large_uncased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz', 'uncased_L-24_H-1024_A-16.tar.gz'), + # 'bert_base_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz', 'cased_L-12_H-768_A-12.tar.gz'), + # 'bert_large_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz', 'cased_L-24_H-1024_A-16.tar.gz'), + # 'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'), + # 'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'), + # 'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip') + } + + # SHA256sum verification for file download integrity (and checking for changes from the download source over time) + self.bert_base_uncased_sha = { + 'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc', + 'bert_model.ckpt.data-00000-of-00001': 'f8d2e9873133ea4d252662be01a074fb6b9e115d5fd1e3678d385cf65cf5210f', + 'bert_model.ckpt.index': '06a6b8cdff0e61f62f8f24946a607aa6f5ad9b969c1b85363541ab144f80c767', + # 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_large_uncased_sha = { + 'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb', + 'bert_model.ckpt.data-00000-of-00001': '9aa66efcbbbfd87fc173115c4f906a42a70d26ca4ca1e318358e4de81dbddb0b', + 'bert_model.ckpt.index': '1811d5b68b2fd1a8c5d2961b2691eb626d75c4e789079eb1ba3649aa3fff7336', + # 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_base_cased_sha = { + 'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc', + 'bert_model.ckpt.data-00000-of-00001': 'ed0febc0fbcd2b7ef9f02112e00cb26c5de2086bca26c07b48b09c723446bc85', + 'bert_model.ckpt.index': 'af085a027ef3686466c9b662f9174129401bb4bc49856c917c02322ab7ca26d5', + 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_large_cased_sha = { + 'bert_config.json': 
'7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57', + 'bert_model.ckpt.data-00000-of-00001': '1f96efeac7c8728e2bacb8ec6230f5ed42a26f5aa6b6b0a138778c190adf2a0b', + 'bert_model.ckpt.index': '373ed159af87775ce549239649bfc4df825bffab0da31620575dab44818443c3', + 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_base_multilingual_cased_sha = { + 'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0', + 'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5', + 'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37', + 'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa', + 'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c', + } + + self.bert_large_multilingual_uncased_sha = { + 'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624', + 'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429', + 'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7', + 'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29', + 'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f', + } + + self.bert_base_chinese_sha = { + 'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015', + 'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba', + 'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e', + 'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047', + 'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c', + } + + # Relate SHA to urls for loop below + self.model_sha = { + 'bert_base_uncased': self.bert_base_uncased_sha, + 'bert_large_uncased': self.bert_large_uncased_sha, + # 'bert_base_cased': self.bert_base_cased_sha, + # 'bert_large_cased': self.bert_large_cased_sha, + # 'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha, + # 'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha, + # 'bert_base_chinese': self.bert_base_chinese_sha + } + + # Helper to get sha256sum of a file + def sha256sum(self, filename): + h = hashlib.sha256() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + + return h.hexdigest() + + def download(self): + # Iterate over urls: download, unzip, verify sha256sum + found_mismatch_sha = False + for model in self.model_urls: + url = self.model_urls[model][0] + file = self.save_path + '/' + self.model_urls[model][1] + + print('Downloading', url) + response = urllib.request.urlopen(url) + with open(file, 'wb') as handle: + handle.write(response.read()) + + print('Unzipping', file) + tf = tarfile.open(file) + tf.extractall(self.save_path) + + sha_dict = self.model_sha[model] + for extracted_file in sha_dict: + sha = sha_dict[extracted_file] + if sha != self.sha256sum(file[:-7] + '/' + extracted_file): + found_mismatch_sha = True + print('SHA256sum does not match on file:', extracted_file, 'from download url:', url) + else: + print(file[:-7] + '/' + extracted_file, '\t', 
'verified') + + if not found_mismatch_sha: + print("All downloads pass sha256sum verification.") + + def serialize(self): + pass + + def deserialize(self): + pass + + def listAvailableWeights(self): + print("Available Weight Datasets") + for item in self.model_urls: + print(item) + + def listLocallyStoredWeights(self): + pass + diff --git a/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py b/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py new file mode 100644 index 00000000..13c9a320 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py @@ -0,0 +1,27 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +class NVIDIAPretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/nvidia_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + pass + + + def download(self): + assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py b/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py new file mode 100644 index 00000000..a2aef07a --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
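+
+# Typical usage, mirroring how Downloader.py drives this class:
+#   PubMedDownloader('baseline', save_path).download()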
+ +import bz2 +import glob +import gzip +import os +import urllib.request +import shutil +import sys + +class PubMedDownloader: + def __init__(self, subset, save_path): + self.subset = subset + # Modifying self.save_path in two steps to handle creation of subdirectories + self.save_path = save_path + '/pubmed' + '/' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.save_path = self.save_path + '/' + subset + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.download_urls = { + 'baseline' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/', + 'daily_update' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/', + 'fulltext' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/', + 'open_access' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/' + } + + + def download(self): + print('subset:', self.subset) + url = self.download_urls[self.subset] + self.download_files(url) + self.extract_files() + + + def download_files(self, url): + url = self.download_urls[self.subset] + output = os.popen('curl ' + url).read() + + if self.subset == 'fulltext' or self.subset == 'open_access': + line_split = 'comm_use' if self.subset == 'fulltext' else 'non_comm_use' + for line in output.splitlines(): + if line[-10:] == 'xml.tar.gz' and \ + line.split(' ')[-1].split('.')[0] == line_split: + file = os.path.join(self.save_path, line.split(' ')[-1]) + if not os.path.isfile(file): + print('Downloading', file) + response = urllib.request.urlopen(url + line.split(' ')[-1]) + with open(file, "wb") as handle: + handle.write(response.read()) + + elif self.subset == 'baseline' or self.subset == 'daily_update': + for line in output.splitlines(): + if line[-3:] == '.gz': + file = os.path.join(self.save_path, line.split(' ')[-1]) + if not os.path.isfile(file): + print('Downloading', file) + response = urllib.request.urlopen(url + line.split(' ')[-1]) + with open(file, "wb") as handle: + handle.write(response.read()) + else: + assert False, 'Invalid PubMed dataset/subset specified.' + + def extract_files(self): + files = glob.glob(self.save_path + '/*.xml.gz') + + for file in files: + print('file:', file) + input = gzip.GzipFile(file, mode='rb') + s = input.read() + input.close() + + out = open(file[:-3], mode='wb') + out.write(s) + out.close() + + + diff --git a/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py new file mode 100644 index 00000000..df851789 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
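+
+# Typical usage (the output filename here is a placeholder):
+#   PubMedTextFormatting(pubmed_path, 'pubmed_corpus.txt').merge()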
+ +import glob +import os +import pubmed_parser as pmp + +class PubMedTextFormatting: + def __init__(self, pubmed_path, output_filename, recursive = False): + self.pubmed_path = pubmed_path + self.recursive = recursive + self.output_filename = output_filename + + + # This puts one article per line + def merge(self): + print('PubMed path:', self.pubmed_path) + + with open(self.output_filename, mode='w', newline='\n') as ofile: + for filename in glob.glob(self.pubmed_path + '/*.xml*', recursive=self.recursive): + print('file:', filename) + dicts_out = pmp.parse_medline_xml(filename) + for dict_out in dicts_out: + if not dict_out['abstract']: + continue + try: + for line in dict_out['abstract'].splitlines(): + if len(line) < 30: + continue + ofile.write(line.strip() + " ") + ofile.write("\n\n") + except: + ofile.write("\n\n") + continue diff --git a/modelzoo/LanguageModeling/BERT/data/README.md b/modelzoo/LanguageModeling/BERT/data/README.md new file mode 100644 index 00000000..48168422 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/README.md @@ -0,0 +1,28 @@ +Steps to reproduce datasets from web + +1) Build the container + * docker build -t bert_tf2 . +2) Run the container interactively + * nvidia-docker run -it --ipc=host bert_tf2 + * Optional: Mount data volumes + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/download + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/extracted_articles + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/raw_data + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/intermediate_files + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_text_file_single + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_text_files_sharded + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_tfrecords_sharded + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/download + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_text_file_single + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_text_files_sharded + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_tfrecords_sharded + * Optional: Select visible GPUs + * -e CUDA_VISIBLE_DEVICES=0 + +** Inside of the container starting here** +3) Download pretrained weights (they contain vocab files for preprocessing) and SQuAD + * bash data/create_datasets_from_start.sh squad +5) "One-click" Wikipedia data download and prep (provides tfrecords) + * bash data/create_datasets_from_start.sh pretrained wiki_only +6) "One-click" Wikipedia and BookCorpus data download and prep (provided tfrecords) + * bash data/create_datasets_from_start.sh pretrained wiki_books diff --git a/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py b/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py new file mode 100644 index 00000000..6d64ffc6 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py @@ -0,0 +1,54 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
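+
+# Typical usage, as invoked from Downloader.py:
+#   SquadDownloader(save_path).download()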
+ +import bz2 +import os +import urllib.request +import sys + +class SquadDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/squad' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + if not os.path.exists(self.save_path + '/v1.1'): + os.makedirs(self.save_path + '/v1.1') + + if not os.path.exists(self.save_path + '/v2.0'): + os.makedirs(self.save_path + '/v2.0') + + self.download_urls = { + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json', + 'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json', + 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py', + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/LanguageModeling/BERT/data/TextSharding.py b/modelzoo/LanguageModeling/BERT/data/TextSharding.py new file mode 100644 index 00000000..a6b0ca49 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/TextSharding.py @@ -0,0 +1,331 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from itertools import islice + +import multiprocessing +import os +import statistics + +class Sharding: + def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set): + assert len(input_files) > 0, 'The input file list must contain at least one file.' + assert n_training_shards > 0, 'There must be at least one output shard.' + assert n_test_shards > 0, 'There must be at least one output shard.' 
+ + self.n_training_shards = n_training_shards + self.n_test_shards = n_test_shards + self.fraction_test_set = fraction_test_set + + self.input_files = input_files + + self.output_name_prefix = output_name_prefix + self.output_training_identifier = '_training' + self.output_test_identifier = '_test' + self.output_file_extension = '.txt' + + self.articles = {} # key: integer identifier, value: list of articles + self.sentences = {} # key: integer identifier, value: list of sentences + self.output_training_files = {} # key: filename, value: list of articles to go into file + self.output_test_files = {} # key: filename, value: list of articles to go into file + + self.init_output_files() + + + # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines) + def load_articles(self): + print('Start: Loading Articles') + + global_article_count = 0 + for input_file in self.input_files: + print('input file:', input_file) + with open(input_file, mode='r', newline='\n') as f: + for i, line in enumerate(f): + if line.strip(): + self.articles[global_article_count] = line.rstrip() + global_article_count += 1 + + print('End: Loading Articles: There are', len(self.articles), 'articles.') + + + def segment_articles_into_sentences(self, segmenter): + print('Start: Sentence Segmentation') + if len(self.articles) == 0: + self.load_articles() + + assert len(self.articles) != 0, 'Please check that input files are present and contain data.' + + # TODO: WIP: multiprocessing (create independent ranges and spawn processes) + use_multiprocessing = 'serial' + + def chunks(data, size=len(self.articles)): + it = iter(data) + for i in range(0, len(data), size): + yield {k: data[k] for k in islice(it, size)} + + if use_multiprocessing == 'manager': + manager = multiprocessing.Manager() + return_dict = manager.dict() + jobs = [] + n_processes = 7 # in addition to the main process, total = n_proc+1 + + def work(articles, return_dict): + sentences = {} + for i, article in enumerate(articles): + sentences[i] = segmenter.segment_string(articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + return_dict.update(sentences) + + for item in chunks(self.articles, len(self.articles)): + p = multiprocessing.Process(target=work, args=(item, return_dict)) + + # Busy wait + while len(jobs) >= n_processes: + pass + + jobs.append(p) + p.start() + + for proc in jobs: + proc.join() + + elif use_multiprocessing == 'queue': + work_queue = multiprocessing.Queue() + jobs = [] + + for item in chunks(self.articles, len(self.articles)): + pass + + else: # serial option + for i, article in enumerate(self.articles): + self.sentences[i] = segmenter.segment_string(self.articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + print('End: Sentence Segmentation') + + + def init_output_files(self): + print('Start: Init Output Files') + assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' 
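+        # For example, with output_name_prefix='wiki' and n_training_shards=256
+        # this registers 'wiki_training_0.txt' ... 'wiki_training_255.txt' (and
+        # the analogous '<prefix>_test_<i>.txt' names), each mapped to an empty
+        # list of article ids.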
+ + for i in range(self.n_training_shards): + name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension + self.output_training_files[name] = [] + + for i in range(self.n_test_shards): + name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension + self.output_test_files[name] = [] + + print('End: Init Output Files') + + + def get_sentences_per_shard(self, shard): + result = 0 + for article_id in shard: + result += len(self.sentences[article_id]) + + return result + + + def distribute_articles_over_shards(self): + print('Start: Distribute Articles Over Shards') + assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.' + + # Create dictionary with - key: sentence count per article, value: article id number + sentence_counts = defaultdict(lambda: []) + + max_sentences = 0 + total_sentences = 0 + + for article_id in self.sentences: + current_length = len(self.sentences[article_id]) + sentence_counts[current_length].append(article_id) + max_sentences = max(max_sentences, current_length) + total_sentences += current_length + + n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences) + nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards + nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards + + consumed_article_set = set({}) + unused_article_set = set(self.articles.keys()) + + # Make first pass and add one article worth of lines per file + for file in self.output_training_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard: + nominal_sentences_per_training_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per training shard.') + + for file in self.output_test_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard: + nominal_sentences_per_test_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per test shard.') + + training_counts = [] + test_counts = [] + + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + # Make subsequent passes over files to find 
articles to add without going over limit + history_remaining = [] + n_history_remaining = 4 + + while len(consumed_article_set) < len(self.articles): + for fidx, file in enumerate(self.output_training_files): + nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + for fidx, file in enumerate(self.output_test_files): + nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed + if len(history_remaining) == n_history_remaining: + history_remaining.pop(0) + history_remaining.append(len(unused_article_set)) + + history_same = True + for i in range(1, len(history_remaining)): + history_same = history_same and (history_remaining[i-1] == history_remaining[i]) + + if history_same: + nominal_sentences_per_training_shard += 1 + # nominal_sentences_per_test_shard += 1 + + training_counts = [] + test_counts = [] + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + print('Distributing data over shards:', len(unused_article_set), 'articles remaining.') + + + if len(unused_article_set) != 0: + print('Warning: Some articles did not make it into output files.') + + + for shard in self.output_training_files: + print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard])) + + print('End: Distribute Articles Over Shards') + + + def write_shards_to_disk(self): + print('Start: Write Shards to Disk') + for shard in 
self.output_training_files:
+            self.write_single_shard(shard, self.output_training_files[shard], 'training')
+
+        for shard in self.output_test_files:
+            self.write_single_shard(shard, self.output_test_files[shard], 'test')
+
+        print('End: Write Shards to Disk')
+
+    def write_single_shard(self, shard_name, shard, split):
+        shard_split = os.path.split(shard_name)
+        shard_name = shard_split[0] + '/' + split + '/' + shard_split[1]
+
+        with open(shard_name, mode='w', newline='\n') as f:
+            for article_id in shard:
+                for line in self.sentences[article_id]:
+                    f.write(line + '\n')
+
+                f.write('\n')  # Line break between articles
+
+
+import nltk
+
+nltk.download('punkt')
+
+class NLTKSegmenter:
+    def __init__(self):
+        pass
+
+    def segment_string(self, article):
+        return nltk.tokenize.sent_tokenize(article)
+
diff --git a/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py b/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py
new file mode 100644
index 00000000..1e5e36ce
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+import subprocess
+
+class WikiDownloader:
+    def __init__(self, language, save_path):
+        self.save_path = save_path + '/wikicorpus_' + language
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.language = language
+        self.download_urls = {
+            'en': 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh': 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+        }
+
+        self.output_files = {
+            'en': 'wikicorpus_en.xml.bz2',
+            'zh': 'wikicorpus_zh.xml.bz2'
+        }
+
+
+    def download(self):
+        if self.language in self.download_urls:
+            url = self.download_urls[self.language]
+            filename = self.output_files[self.language]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + filename):
+                print('** Download file already exists, skipping download')
+            else:
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename), '--no-check-certificate']
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')
+
+            # Always unzipping since this is relatively fast and will overwrite
+            print('Unzipping:', self.output_files[self.language])
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
+
+        else:
+            assert False, 'WikiDownloader not implemented for this language yet.'
+
diff --git a/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
new file mode 100644
index 00000000..9d356b13
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/__init__.py b/modelzoo/LanguageModeling/BERT/data/__init__.py
new file mode 100644
index 00000000..d49f0d05
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/bertPrep.py b/modelzoo/LanguageModeling/BERT/data/bertPrep.py
new file mode 100644
index 00000000..656d909e
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/bertPrep.py
@@ -0,0 +1,388 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
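+
+# Pipeline overview: the --action stages below are intended to be run in
+# sequence (download, text_formatting, sharding, create_tfrecord_files), with
+# each stage reading from and writing to the directories listed in
+# directory_structure under $BERT_PREP_WORKING_DIR.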
+ +import BookscorpusTextFormatting +import Downloader +import TextSharding +import WikicorpusTextFormatting +import PubMedTextFormatting + +import argparse +import itertools +import multiprocessing +import os +import pprint +import subprocess + + +def main(args): + working_dir = os.environ['BERT_PREP_WORKING_DIR'] + + print('Working Directory:', working_dir) + print('Action:', args.action) + print('Dataset Name:', args.dataset) + + if args.input_files: + args.input_files = args.input_files.split(',') + + hdf5_tfrecord_folder_prefix = "/lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \ + + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \ + + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \ + + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100)) + directory_structure = { + 'download' : working_dir + '/download', # Downloaded and decompressed + 'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) + 'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same + 'sharded' : working_dir + '/sharded', + 'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix, + 'hdf5': working_dir + '/hdf5'+ hdf5_tfrecord_folder_prefix, + } + + print('\nDirectory Structure:') + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(directory_structure) + print('') + + if args.action == 'download': + if not os.path.exists(directory_structure['download']): + os.makedirs(directory_structure['download']) + + downloader = Downloader.Downloader(args.dataset, directory_structure['download']) + downloader.download() + + elif args.action == 'text_formatting': + assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \ + and args.dataset != 'squad' and args.dataset != 'mrpc' and args.dataset != 'cola' and \ + args.dataset != 'mnli' and args.dataset != 'sst-2', 'Cannot perform text_formatting on pretrained weights' + + if not os.path.exists(directory_structure['extracted']): + os.makedirs(directory_structure['extracted']) + + if not os.path.exists(directory_structure['formatted']): + os.makedirs(directory_structure['formatted']) + + if args.dataset == 'bookscorpus': + books_path = directory_structure['download'] + '/bookscorpus' + #books_path = directory_structure['download'] + output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' + books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) + books_formatter.merge() + + elif args.dataset == 'wikicorpus_en': + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = 'python -m wikiextractor.WikiExtractor' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + + wiki_path = directory_structure['extracted'] + '/wikicorpus_en' + output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, 
output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'wikicorpus_zh': + assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.' + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + + wiki_path = directory_structure['extracted'] + '/wikicorpus_zh' + output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'pubmed_baseline': + pubmed_path = directory_structure['download'] + '/pubmed' + '/baseline' + output_filename = directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt' + pubmed_formatter = PubMedTextFormatting.PubMedTextFormatting(pubmed_path, output_filename, recursive=True) + pubmed_formatter.merge() + + elif args.action == 'sharding': + # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces) + if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset or 'pubmed' in args.dataset: + if args.input_files is None: + if args.dataset == 'bookscorpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'] + elif args.dataset == 'wikicorpus_en': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'wikicorpus_zh': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'] + elif args.dataset == 'books_wiki_en_corpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'pubmed_baseline': + args.input_files = [directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt'] + + output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset + + if not os.path.exists(directory_structure['sharded']): + os.makedirs(directory_structure['sharded']) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/training'): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/training') + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/test'): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/test') + + # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and + # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this. 
+ # Different languages (e.g., Chinese simplified/traditional) may require translation and + # other packages to be called from here -- just add a conditional branch for those extra steps + segmenter = TextSharding.NLTKSegmenter() + sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set) + + sharding.load_articles() + sharding.segment_articles_into_sentences(segmenter) + sharding.distribute_articles_over_shards() + sharding.write_shards_to_disk() + + else: + assert False, 'Unsupported dataset for sharding' + + elif args.action == 'create_tfrecord_files': + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset) + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/training'): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/training') + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/test'): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/test') + + last_process = None + + def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'): + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' + bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt' + bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format + bert_preprocessing_command += ' --vocab_file=' + args.vocab_file + bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else '' + bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length) + bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq) + bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob) + bert_preprocessing_command += ' --random_seed=' + str(args.random_seed) + bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor) + bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True) + + last_process = bert_preprocessing_process + + # This could be better optimized (fine if all take equal time) + if shard_id % args.n_processes == 0 and shard_id > 0: + bert_preprocessing_process.wait() + + return last_process + + output_file_prefix = args.dataset + + for i in range(args.n_training_shards): + last_process = create_record_worker(output_file_prefix + '_training', i, 'tfrecord', 'training') + + last_process.wait() + + for i in range(args.n_test_shards): + last_process = create_record_worker(output_file_prefix + '_test', i, 'tfrecord', 'test') + + last_process.wait() + + + elif args.action == 'create_hdf5_files': + assert False, 'HDF5 format not fully supported in this release.' 
+ + if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset): + os.makedirs(directory_structure['hdf5'] + "/" + args.dataset) + + last_process = None + + def create_record_worker(filename_prefix, shard_id, output_format='hdf5'): + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' + bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt' + bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format + bert_preprocessing_command += ' --vocab_file=' + args.vocab_file + bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else '' + bert_preprocessing_command += ' --max_seq_length=' + args.max_seq_length + bert_preprocessing_command += ' --max_predictions_per_seq=' + args.max_predictions_per_seq + bert_preprocessing_command += ' --masked_lm_prob=' + args.masked_lm_prob + bert_preprocessing_command += ' --random_seed=' + args.random_seed + bert_preprocessing_command += ' --dupe_factor=' + args.dupe_factor + bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True) + + last_process = bert_preprocessing_process + + # This could be better optimized (fine if all take equal time) + if shard_id % args.n_processes == 0 and shard_id > 0: + bert_preprocessing_process.wait() + + for i in range(args.n_training_shards): + create_record_worker(args.output_file_prefix + '_training', i) + + last_process.wait() + + for i in range(args.n_test_shards): + create_record_worker(args.output_file_prefix + '_test', i) + + last_process.wait() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Preprocessing Application for Everything BERT-related' + ) + + parser.add_argument( + '--action', + type=str, + help='Specify the action you want the app to take. 
e.g., generate vocab, segment, create tfrecords', + choices={ + 'download', # Download and verify mdf5/sha sums + 'text_formatting', # Convert into a file that contains one article/book per line + 'sharding', # Convert previous formatted text into shards containing one sentence per line + 'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info + 'create_hdf5_files' # Turn each shard into a HDF5 file with masking and next sentence prediction info + } + ) + + parser.add_argument( + '--dataset', + type=str, + help='Specify the dataset to perform --action on', + choices={ + 'bookscorpus', + 'wikicorpus_en', + 'wikicorpus_zh', + 'books_wiki_en_corpus', + 'pubmed_baseline', + 'pubmed_daily_update', + 'pubmed_fulltext', + 'pubmed_open_access', + 'google_pretrained_weights', + 'nvidia_pretrained_weights', + 'squad', + 'mrpc', + 'sst-2', + 'mnli', + 'cola', + 'all' + } + ) + + parser.add_argument( + '--input_files', + type=str, + help='Specify the input files in a comma-separated list (no spaces)' + ) + + parser.add_argument( + '--n_training_shards', + type=int, + help='Specify the number of training shards to generate', + default=1472 + ) + + parser.add_argument( + '--n_test_shards', + type=int, + help='Specify the number of test shards to generate', + default=1472 + ) + + parser.add_argument( + '--fraction_test_set', + type=float, + help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)', + default=0.1 + ) + + parser.add_argument( + '--segmentation_method', + type=str, + help='Specify your choice of sentence segmentation', + choices={ + 'nltk' + }, + default='nltk' + ) + + parser.add_argument( + '--n_processes', + type=int, + help='Specify the max number of processes to allow at one time', + default=4 + ) + + parser.add_argument( + '--random_seed', + type=int, + help='Specify the base seed to use for any random number generation', + default=12345 + ) + + parser.add_argument( + '--dupe_factor', + type=int, + help='Specify the duplication factor', + default=5 + ) + + parser.add_argument( + '--masked_lm_prob', + type=float, + help='Specify the probability for masked lm', + default=0.15 + ) + + parser.add_argument( + '--max_seq_length', + type=int, + help='Specify the maximum sequence length', + default=512 + ) + + parser.add_argument( + '--max_predictions_per_seq', + type=int, + help='Specify the maximum number of masked words per sequence', + default=20 + ) + + parser.add_argument( + '--do_lower_case', + type=int, + help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)', + default=1 + ) + + parser.add_argument( + '--vocab_file', + type=str, + help='Specify absolute path to vocab file to use)' + ) + + parser.add_argument( + '--skip_wikiextractor', + type=int, + help='Specify whether to skip wikiextractor step 0=False, 1=True', + default=0 + ) + + parser.add_argument( + '--interactive_json_config_generator', + type=str, + help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords' + ) + + args = parser.parse_args() + main(args) diff --git a/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh b/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh new file mode 100644 index 00000000..3f1a4163 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. 
All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR}" + +# Download +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset pubmed_baseline + +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab + +# Properly format the text files +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset pubmed_baseline + + +# Shard the text files +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action sharding --dataset pubmed_baseline + +### BERT BASE + +## UNCASED + +# Create TFRecord files Phase 1 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \ + --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt + + +# Create TFRecord files Phase 2 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \ + --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt + + +## CASED + +# Create TFRecord files Phase 1 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \ + --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt \ + --do_lower_case=0 + + +# Create TFRecord files Phase 2 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \ + --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt \ + --do_lower_case=0 diff --git a/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh b/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh new file mode 100644 index 00000000..f09b0d0f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
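+
+# Usage (as implied by the argument handling below):
+#   ./create_datasets_from_start.sh [to_download] [pretrained_to_download]
+#   to_download:             all (default) | squad | pretrained
+#   pretrained_to_download:  wiki_only (default) | wiki_books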
+
+export BERT_PREP_WORKING_DIR=/workspace/bert_tf2/data
+
+to_download=${1:-"all"}
+pretrained_to_download=${2:-"wiki_only"} # By default, we don't download BooksCorpus dataset due to recent issues with the host server
+
+if [ "$to_download" = "all" ] || [ "$to_download" = "squad" ] ; then
+    #SQUAD
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
+
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset squad
+
+    export BERT_DIR=${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export SQUAD_DIR=${BERT_PREP_WORKING_DIR}/download/squad
+    python create_finetuning_data.py \
+      --squad_data_file=${SQUAD_DIR}/v1.1/train-v1.1.json \
+      --vocab_file=${BERT_DIR}/vocab.txt \
+      --train_data_output_path=${SQUAD_DIR}/v1.1/squad_v1.1_train.tf_record \
+      --meta_data_file_path=${SQUAD_DIR}/v1.1/squad_v1.1_meta_data \
+      --fine_tuning_task_type=squad --max_seq_length=384
+
+    python create_finetuning_data.py \
+      --squad_data_file=${SQUAD_DIR}/v2.0/train-v2.0.json \
+      --vocab_file=${BERT_DIR}/vocab.txt \
+      --train_data_output_path=${SQUAD_DIR}/v2.0/squad_v2.0_train.tf_record \
+      --meta_data_file_path=${SQUAD_DIR}/v2.0/squad_v2.0_meta_data \
+      --fine_tuning_task_type=squad --max_seq_length=384 --version_2_with_negative=True
+fi
+
+if [ "$to_download" = "all" ] || [ "$to_download" = "pretrained" ] ; then
+    #Pretrained
+    if [ "$pretrained_to_download" = "wiki_books" ] ; then
+        python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset bookscorpus
+    fi
+
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset wikicorpus_en
+
+    DATASET="wikicorpus_en"
+    # Properly format the text files
+    if [ "$pretrained_to_download" = "wiki_books" ] ; then
+        python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset bookscorpus
+        DATASET="books_wiki_en_corpus"
+    fi
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset wikicorpus_en
+
+    # Shard the text files
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action sharding --dataset $DATASET
+
+    # Create TFRecord files Phase 1
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 128 \
+      --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
+
+
+    # Create TFRecord files Phase 2
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 512 \
+      --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
+fi
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png b/modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..40193e9e13ba2daa25f70d1ee69a34f1958ca7ce
GIT binary patch
literal 212516
[base85-encoded binary payload for bert_pipeline.png omitted]
z7-noMu|37dc{VjKtIwpl)*>9~FA3#b4Iq7ULR-sxgWI7_Z*zr+UQ5>|EsIrk%32P; z>xX94X;nLwDKcrYcqIN~ct4Tg3fSPqYxrM>lU1pJ2s*iW&j6S#SDEpDu(3ml?`<}v z>tW!tH=QXB^G7pTECW|1!Qp0uu#mQVItclNUY^l;#p~dhz$*BJY&vZ#*!6~DG|YPj z;4&EPxOg1ym&Jmf`0#b2@Wamv%^J@n%xZ@VR?fNy+cw&zawk|J+c`Mn;bLuDhR41x z@1jt!8q$d`h`*N$G*q%D8_)2Z&CS>y> z=W3Q+m^9t5?KFtqC`KVAKXF4ZatLoYLlqvJDon#VT!XeQFGtE@x+sHR)Dc|4_DyH* z%l3Y#5dl60t8eE{qw+kq-IVhlyJCeAc+a1aQvRu&kK)f99(F2{_u7KxnK+pjeSFov z`l46*xeS%|oX|8%%k*J$umcoS()mG1PUcOj&?PjmnytfDpybGPk7t#DQeBXH$zii7 zfH28hzKbVoel4-j&rJBhayg7rmFZB&Ak}m_Hl{4hdgpm#nPI5|&8p{aF%L^*2FSk# zKC!=~{~>kSTMm?AD;3K84xh39!MI@5Td0M@J$pP?lSF%;ZdGWMhYUY0FFIC3?DJ%7 z;G8Fx=51Yx&gZvmDpZ5Ag<(C})#{ASL)NQh)`G$ZiOQ6WZGJ;RXR0mqIQHOHX%ip% z1JeqK$IhDe>S1r;oc2LSRr3bZvG2Cpn^~=o1Iqh~wtKMK>+~r71(!n|iW*2U-^^d= z8H#kT>0Q;ISI_pPe!NMS1GT2?3}FIo#Zp}}U&75W&woFxD~|rm#F-8#Vui_g)a*TY zX2<)YvR&p8mf5GLC(=fCs-!(o_L|bRaa!Lq{l;`4dM*vz3l%o(1kLYwyJidA!iapE z)m-V7VO~spWrLzAUQAjxKOKuz$C5gsD(KmG8_R}L8=XM?cQ4JTMIx#y(9~Ob`ne_z z9fZM55tUx-7wQIo%)0e@NZzv!ufga@S-$iwW{>Nsb_c(zItDJ^RD%HR-p3rKQnJ1v zI3!nvC$8eanOvrJT-xg1v0UO{EjeXN*G>>blSU6#NdnBf|)_nx7W zf+L;)ZbVG;`+tIcu-W%7fj*Db2E}&3(nWcIPs?-An<|Ai!M$oSwDEl9cjQYxHW*f8 z!lC1&=S4VxwcA}+xl@zE?NhL=lkqKEaA@u`z+~dAW-R`%A?U`zq|0`nATAr^p9`NL zzfMV9D-KNQ4#N$8=lV{@kj6kEf}>4aFYimerUqE&qQs@$9N_)O+YKqL6J#*h_Tme~ zvJ!NFEp2XLlK)(@M$^?yNyqEW4(tE2@TjM#%n?LO>u2{k9fuPNl5p~ccZLY@I!mR= zTZw9I(P;1GKV+7%ep@Jq)vgDR)d@_%)tDI!%?>Ur22jj7jU8N;Rb%&jkGf-8of z44Vd$9>o877Wv5pM(Fykvq8rdkcy#PZWS*A@(P?x^>Dsr$>zR$lMs2{$b+lz=?q0; zx))!S6#XfHl|s<5C>guz9$sB%j;Tzw0qLf02PkoAsbwP)9p>d5cYWJ!4~GZ6rrh)g zJgetB3M>dVyL^A{5X{i`AAM;X0j#gw8f56u$jQ*Kl;59UU@>VqR$CCEeI~PQJcM?- z#zkWP^D*g^%1am3OwsSzZ{y*h*5Tr4w*f0DD;eu;F%wp%)i-$)cR34();Q{=A8{_A zU>Q1!$1+ODpf={Em%7>uKL(wIcDo>e%63P1kGFUfX!`G_V5}9>)EF=0vAa~wLj#%h z;&9o67%}|qryuD!Hz_w&ar~KEq?Py}#cMe(V{m>#FP7+JjUxh(UVW+52IP&3Z5ig6qj8U2-((`C#>X^Q&?XFxk9G6g zekl=^k?)YsK7aq3u&Y5sszT(QV5qffLncntblhW&b2#+s$!p3*D@o125Elo@RV2Tg zqjjm>kq;^7zQ}>1=ZXZ~Hfh@c*tQ4;Va5IOy?XZlQQMNjv^%t`KHN;+m6gu1cj0EE zeTwMeNkXx}c=qi_S0i>fZs#1qA5z!me@g7%75;@>RhT}~4t;C2m5GwO2H{g(#deDB z#`ETh&5kNozV<_%W30wkg}9KoWy2>no%1N!R@|>y!jCtNGDu0?ZPMR8`O;j7?-_B_ zt!mnNimqgm1GLlnyn;0>LiFFdFDpNbtroDuIl2bpA}^Jcl4OTpGBH^FUlyRaHa6*k z7`Zcy_nGmIcDT5~Fp2-bxYR?sSc3wSPGy(ec50fa5@5$hUCogzXNJ+YgJoyucPBgN zBN2y`l`z8KHePXpnUsKA#_kl=Z|@U@Pq!7`-t>@EiuRg_#X6uCr|fXgPM~|CW-vfm$e7AED#>8D#Bu`{6HH4rJcai zNyi<9ZKym;I8D!SbdY}F?V8q0k+jkow*~TLZwXqqGc7$#89{i?!lR~UYUVj%?vlsu z5K&>@2l4&NwQ-rLKi$^x0+sFHl(0lgg{ixt*LAOX{Cpx?a>Gw2=tU(;T@H$sbk&qt z!e?j8I#h3U)c`Y5F4=pC0r%^^+?7?K~dNwuY$-mVTi*g7zI0}xez+T|a>b1Wf@ zHURYSJ3LzhuwiFC=RP2l_QTM87;u9$V@h^<4YNdZ0t7l*g1oOo)id9{1>}XKj}DF^ z1#HPwoMv!J;j?Gow=3c`Bp>J=wK0F(%lY9jT+#s$x&k|{T9{o?Dx^PdAqHXTI3_2! zt(HiOCU9$gYyll5m`k}f8STq%Ir)0E{?>KiOq6>$KSyW#X4Mxb%F`kfKo>?`2iebo zXocQ>`b2vIxiwW_(`@AGD$Y)&{c8Zt)F+XmceP1UA;I{wwYW#H{2rU7{pV3n%mSI$ z^jx;b%H?NePn&pvm0{IcrkVr`MRofMwA(Q>I!UI-#tJ+f8|NV$bKxgxnl{M1KIFlh zb4vV|0IM#;5Oe%_C54$vWqf|*7U~+20L|<=Oh_Q41mOpKgeE`wTKy||Tjny1)tP8* zw!GJ8mT)%ps<-@1P6F*i!a_%8=!N`POj+Z;y@2svS^(P{BVG#~%*HL+m(Gl6Od7Gg6f~D2WxmReO@ncVxJ%Z5nT(b7_puzf(w@Ki&+LBasU7$iz6xOMuH2 zQ9Y#&7}C=59S+2NitWA71Lzu=>n<~F*>GWFYtdiFfaj0FEd3V;7GDK?2Py2jSqS2377AFbj0{ z5NN(K?;1%g+;Va4Y8qyS-j-}OAQ&;{0TNVd&({C)R4jZu2oXdWT z0VsE9BTmEe@;u>*ypEwkO%HOdHY`!A0d_A)0)`@43?DZ2+ApIm?gvaRuJ4z!i-{@K z3sQF5+--xBOpXTxF)2o5*F6MHq`NzJzZlB?V-Fe2W2GEtGR|FcxC!*}9@;TCE#(<} zU}SVVsNCB)_-?}L*Hh0p#F7Xs`HRQs%cd-a*3So&K)><)AAGZljA{;`){ujsGcM72 z^TCrR!J7q5>R0udpU5zBv#&03!c`x?8aCcXJ{h! 
z3mP3z-_v`$cm~wuQU-jJ7@%Gf*3991x*r#57hBNMX#=?|mD`dCxU+S9?D2{ZE8fbC zPQc&8%j{p!6`IWTON{FHo^(~NM3OvvLc)UlSlqIDviXcl^~^gfOdtZb3v9Z&X+L)M zG=)Fvdpz%V7HLoyY#Ztz2m`AdTYgh3_Wii7a4cI`+(o^J;*z(k@%S}Jt5(^5B&vWO z$Mvw~n1wBb&VP(8oi8=D2UeG$8#pV3U7{XMLnUSBwybA^tiOb;%NtZh zh{DB}(H@#Mixq>iG*!>jV+4MybIIRyRjdo`>?#ZP_1=e1R9;UBmJy*nwPR*7kv`8E z7E|w{R!z=fWnfMDq2~Wj7aP(Q;$&i_JUaF+*T6$455?Q1-m|uRJGuGyX=gUZbwfR9 zTN$+uM_>E~h*UO=nf6IDVtf*ur^g5~dm?$fV4ic^WmD9h+kDzAZFY0leyPNW*>nam zNLssDh`RPUjXGS+oi;*Bp64CS`T9&O&k^)eVK<1*?AT$0IA)_Y0^EL^4U){zQ1{uy zH*=v8|zPKqP(2`AEqxg zOIv3)>B0S2dtOGN9wsanMdG%Q@LjAm+V!7*UooeMpR%RwtCw-snI!iBYS$o@!d&6I zVa=9~8^hgZ?>hn4W+u`sm^LZESd$5C*~1j#-kz8bYO6h^e|g%RtO9x+t*_{+_xP_} z%eAzn9Z)NgT#CMK!wK)A{$hoEMjAScSH9_(zAx_d-LIe}L*Q4(N4%U$&+WQ3m?d>pr-dwr z%&s%6^Dbr|@P7@TcUKH2t`#N_#ze-HcajQP*Cnct*e6^AyciR1DC)BW9D1DaTyCQ} zG+Vea%?X&;33ZNhSL+Wd%z2Oes7&Z)IH0-QBNK1`HR>8W3aJm_TWfS5 zjdz^ZP(Q*F*g>-&2oqDAgo+iecR{z_I6svwZxTn)9H&mD>cW8?(t_8psR`(j>6#mu zB=q=E%l$GhbFEckUj;3iL~dO-V5?hiiTXE89+MMtUnLB5vV#m{F6-ViCV(k3M_fcE z2{n=LW$*8Br?XsqY0W3oE5ro-GveryJ2)K#L*0uJ95F)BEReQ7bx1@uq5@%ILKB=J zcuP%}P*VybP`0&u6ISDlIx2lJtiX+}Z5jX_)Ss@DN4GYcDZWec)8t9@-IWn$EQ^6e z)+s#jGbmZJ5?VS2)*oNC^vY8Mr8@U#KCU;gkOOAp!x|86lgedFNY{$TDCD4F`kMSe za^8<_`%-gnA#*3szidsiHHj7dc&V4EZymX`X;l>MV3R=KZ5>4|e zPuJnbRU2x2$V|yC_Q27K|6!^#&(-uBk5>k_+gkJ+)-LTeg{}>Y7%8K`b^n?705TS ztkt-BtNMDlVQ1^7DL*Y+%k@1yL0msm=krj0=F~Zsi+goDnx>elXA<~Nf&}I}7h)|; z1{jlLvibS5S)AUUDI-+o8=9I^?dNYo%efcdqjUx60leL+EA!umn~zZo#(ZT*2WI>nBsO1qaN*bM~4LW zFGDG0N#7}Z2`tLx*G5}zxq^=ih;yu$d!tkdZ~{laMZ*j(Mf(D;zDX(PfuAUq{}gk8 zWs%aLs&?)6U*#Z1GIlPukEd}ls-`A=4b}l^z2uFel0igIXke_dnff>PY1jn?g91<1 z*i2wP!>L}-B4W>X*fhYX@@z7|^+pBoqAj8uI|R`ER6Teipe0~iaNCVBOJE<^*wAun z4-90!j9sQ>m%bb$w+J6gAI*5FP3!jY7{Te|nyZUBV+M|fxvSV_`hH9Ob#*FwB3O>q zs|nMc{;D~1(R(6G5)a-juEUJ{QlCD(RLxIepQxpPN@r{3JO1F5@AhXss<*YXjdLG& zNeS=rAC9a&2ts^&y}`1M)+g9=)L z)a*_>I!`3B+ppJRy~Q%;!E!tHo~S@0a}C65$e7rDK}aps>0U5Ky(D>sq>tNgg!RE!-h5 zU>9$|Pv)#>YsfdNY=iF!y^sz-Ntz-j$Ann~7#si6#zR(4i0<`mlJl0qa%?4}lU_0H zp60h}A3;d16UYdf2$6W!{K=uxps|0xet(+^{X~cPqiJe*rNy(UzOU>YOIA zWYJo~_Z*n~rejmMq4a47@0}|@-B)|E7=%viwvR)0$HymX#`&<01!b`2nV!nmZ>N!4 z)!1C~TYR>g0fpHHP6A48Nssf&fyo;!F2kndhC-?e7`gEU(j)Y5*0uP1tXO*zjBYl| z-&Bql=p?lE*fP^dJu*~FWit`&#tfe)f7KP9Wy{yA5nb(iXo{G8)C>icaz+*yNlpcD z-wlApVefoXbB~T0`k$v04mRvak@cfK+3XdY<6&cj!6Oqhy%LHk4h~Li`*zgYPR;mxzm0&kV46=J3$OO}~0K_Uvb2Z9vwP*nc_ok`rTPO=zttq3$da@J;M6t}I zqIDuB$gk)+?usC7LzGK)Y?nTZJP8%K!yf5I1y^F5;$q{Q2eY;wy)>9XMvQ+^J`&S# ziTv2{xv(&quwi*UWtXDH`894`jard=Cy_RSh8gP`6@nS_Vw8H*G_f@L5lPdkz?ti} zBVthTa*972L{oev3F?;Zg;Rf0ri&K5#4yW8aLCCZv08) zlvY-+%I`1TnmgT%Rd2&%-U990UOjGl!V42Rb8wb$v!t$W+Q5%B7N?>aqqf&+rs1^8 zrQ-Dlnnk=6i+Q`@0bi9hTGuQWYQ(Q^qA6!7N>WN<(^Jag&-&6YE1FNi#TsDTC6QMc z=2`}3u>L`D=!9Ao80AjJ0DVP_4aJWhB$xm6YpP_{*fH7+HlS+atB|2J@=kjH(ts8> z`nb;3`Z^B!hN~;!;I1BFW;Ly~gF)s(52pI|(~ZB&fQI&!lDv3S&s5D@#zx?5QD~02 zAwMW~Jj&!7TtcMiY{}7>SpFz6C{gAya%=JvrFR|uJM6szzzjDqwnNd^SNUehC+GE8 zqK{MW2<|ndiu~&oq%sD2Cbs9p(aUYo0P?QAvzC#mAH;~+K$C4`Ps`dhcyY=?!k#y` zA(3~a%GbQ&%TH(DaXnmA=vY+tH(3S&N1?=_Y9&(i7IPfp7g_!o%!vTQ!C$vDVzu1v zOMz11ds5F^@=^d@NheRu7pQ^vW4G@QX^AEQcwYZx2w~5W|7g}=0B{*+Ra}Xt8DffB zX%No`FKR$@?%k*_2&BX#dxG$p+c%3FP2ko3Bf05u+cV=nuxnYcBoEbS8cukQl(nDE zo{dN)P?R;IL2ea_;2p-?Z_&)BCB6+$76(u~f8hhYSCLXRW2TdLYNk9at8NQXv)?{{ zv)tn)qcv^Z_9`keX94Rj7xk*OnqOk1Olr#5fj1R~CF;;k4~{IcZ`S_|dEJc^Sm7cd z?K8ALch8oXR$~&~*4y=3wwU4WTMUse-2&%1SE!Aocd#mNdl|n5QVO;MqqP#h=0+m6 z`e^Gki?;{ja78XmaCpK`={nf0^ojcZ>4ji`1xM3L#F@hxFk|0WTRejU8<4t{<2rqt zDE3t8U5b03BD5_-JP4IO&=>K&(xotT z`HT@E+9HG{cFFcW#v!q>^z#cx5<5T|dvSU{*S1?;okzvxo}l#7YukfIV{4qR@@27>;@t{_fMY(cDjLP(5WS%No0~3Y 
zfvYJimsNaW_(v7Qcu4>9eW!;QT4|F>Vg8P@b~pkRAZ;){C@Oj{TycaCF2=lLugk!D z+vfG=`?CQewnb}V032kI(QCRq0OBqEN(0yMA$drL5ig|^e!Ss~%Ckdbo;MwiFq#>8nWKc!Qw3c92r^@b+Aq76M;LfjA4u$ZT z24e06OGkU$oXLXgJXl=>S4yn70I_J4pX)eHt>vmRZL7;efxNtR(EJTJA)b&CVeb!E z*FXH#lG4;?Tfd0(rnnZ&SsB|MymRuof^ppPkh*#Do=MyE-3x z93L-_hg3M`PjSRJS;|H9B;`Wvx&fot#}o7K%igPB{F8IuDb$@$L9Av3N6g~@Isow` zU{H6sP3@t@uLZf%7fkQ#()5>{j?bUxTyxUG~L|i8eveuc{p=% zQQ*5bL?R_0yw-by5kDD@9XROO>6QzA5q0?%!4GNZXwmBmLdiC*Q)(sxdm-nA3VnVb zMq^}KME$fBHfPss4qf(eIHGQU?{Q1*7s&Tj?BugVFIt#Dl4j=D1KR)G-Ia{wqaaS; zp;Yz1p8M1RNPUIWW(szuzXRVg$BbeQUrQvLdC>`620pbvo&P+3(AoR{TlNb0+nuFk zG0t}nywoka9)*Ie0>dl&en~OK7-1TtPmne1Hz|^}F%V>L zWa*4cu9jqXQ(z_MZAB*2V#+wd=esryX;*;V8BL6vKwI%m-$pL?i)BY{ zf3kPX2{14{Q~=UT?X#p~El1GW+mOupd1z2!GRgRko%ct`YO(G>AkhFP1z{+^S`0BZ z6wj&jb<{=mCA?<1ncV(^CIj|ok-q!>Bb>&Pj;z;8P} z@6&qZIEq{amh1dD7-9!bbIRi|8g+}dzFo^R?o984B>s7}FZtaV&hGDg2d^d!3OH7f z^0^TWSkx7)YG(xMwXn~H7UJB?2EvA=308SPx#IAwDvf&CV;xtJynI(m!4c;U6Fr^P#};e>>QJ?qmu&Iwxt4;$y@x#0U*01ZCRH|2#BGj^fw z#7oTe6y3NP(2ymvqvw-?yxM&vdtn2l{N??2R{U7V z#>d{Gwh--aLO+F+`)}^1k<$aYe{K*vY^MK?7EHGB+8v)U?+PPuCQuuDY}1u^vp1*D znc}0FhtH$`8SUj+)08;8jJhw*>$E_Fv}ZUU%+^4Q$%ajND1j~-vuFD>m~_PMC5OSj zBe~z-4e$l+`Y2S&`{#5XF_G#Tq1+LwduQr<>c|HLzu4qM=3qnz)u;>E(k|OQ7I0S$ z(p6y*#G@{!y9ww?Q|`ap0nX(8Al5(qnpn|^d{=||;>0xSky=bR4Zh$`$=+ncR6V9r zRPQx==QCI}2O>682>c4&)ciV>$kmwtQi-c2H-_^o2O^DIi+qNiHSLQsy~m{Wy-r6N ztSvsvi*`eMS+QXiAH$BC$lB8&xZ>CS@2Wiy{K1yKMS9A=s7_NVt&g97M85ieCmx4j zih_fsa5#3TN*ot0Ukkz`HV6>6dZ+Tvvc{Pd9`(xiv#8op@G$PsFR&hQxFez)fM zS(RKvneQ}kcAa7MFN*rTs}6+}=Sbs9EY+yh`J`A=VMReH5G&e6WP&=}CC}?Ik_$wA zT#)93(eYc9|7*VQxbn38u@iy1FiG*S#8((*4lpS>x2t=aoYf~A4~_ER0Tiu~VHtDk zd3B}wJCZkxi^oKLC61ryw8Ip{cp=Sbb>{2Ib`zZBp+BdhoK_^`r!U;~5mmA6Fl!Gi zP6xoE0QB}yRC~QJpv1g12t9V>Qa8Vt6Buz@>C~RsvqT{pX}QatF< zfw}L#)-ZU9=o0$bbnbD8XrXLU6@gb{>*%U0P#rT}_NMkE!66cGr}T09F3K_QOATyE z$7QdD#4M49U{y$ql3e7>mPk^rUDX8T@zI82YidAzQP(O;Knjx+j9#@xW65?bOFSbvDzz5&8S-y22*Sl>1BsjVr z)pOwL6d&a?AqO&e7Gu*I2`2V|loopf50~Ldo7%}bYS5krS!mm}L-69ag2l@YEHx*~ z`n=`-^dF zTdO3iQ83m2`qg#Sp+TM^aW$YS>kej4UE7>VmV1l+&7+x&&7YM*^eWopnvQCTu?Eb% zW0^4by$r~1@m-_H z?2m3fnu4U#n%;_qat~&xi)#vYc$9gY3J2D}#$><(nL+Wv>}H=W=6n$&KRwdl1lOmq zP9_Y99$JtZ%g|g6-5n9RH+Op0m1OSi3C#{~P*?Az{S2KWpbHTGrBtBV`laj1lgI<> zvz|W3@jR55W5DfeO#6Hp(@`OwjEvQ2J)xtM=R%oY+Y#`$?ei$zE~$QJfx9RYKfC_S z@x>eQO^m7(LTcBT&grVRw>waqXx_W7!44CSABz_FeS3%Wefr>c|L17;VSQ>oVl@1P zvKq;MOf&9zT9v;~l#V%^sjn-t_xRyDw5$0$ker3-nTjyDXC@`!F0Hn=IJ4FzWZvaD z7UfQvP)^QMV{o5ldDHqv*Qt9VK*OfRWeMFL-+uT)d;ZFjrDG`CFZcgM;e_9s9xjbn zdc1k?GyS`U3=2quSi{fDG;J?ad!8@L?XUZl{Ik(k8SFSDgJJu)AipAar>(nLJJr=_ z1e6TXn!tbGP4z6VfdqYHHRLvQ+3#x^?~31yicFnfl~W*xXKB_q+Q@D~RZ`sRi;{@g z&(^=7yyrSh)y#7;u$3O^+9;Y<2F**t`q1cKPRZju=b>9Jql{k;#NW&PoplV(7Zv}P z1psECi1FU4lffsU1~_u9Nbc94xN7@fX*db}6?%KA{mo@mz3XMa*!Wg>VeTI*QT}Wt zJAS9RzrRO?zDxxz(q?&PG3hN=NP&2}ml*%z++ig6cwktWkN=h}S}?*M@g;*ZP^~yk6aMRdCpQKkPXQ z-BlB+F^9=8ta==JTUVI2)v<(29Ag?{0=)_o3cSyZWLl#_v??(oK?&(!OUsA2tq*#- ze?-PxabiaPJZ*nnRj8+)H)i+B;{Q8)OoPLTC>M0t zQQ7KC?LQHJU&8|DBVQuGKzZoOx`q-!d^GH>aGic+<1#NrN#nB_t|-?88LsAz3=bZQY~>|K8e>qR*iAT&9(A&V1;Ue z)!2&b+f-18#*l17?WHD%SPXIH;9b(Sad4Ibzn4Smc0c!@oT3nuiZQ7kr#ykq72b*V z)zf#K`W`0qvT|}YdL)a)d0WzvPv;c-j|NG_$hEEcw#LlTw(%A=j2y(+eQmNa<2%Ia zbm;m1_K)5=)G6H#$ln%@xfMM9&bGWWoHz&Ofdvblhi;~u+L}#vRhh@vqR`RlCpb!L z&B&!WqoH$PuuL5l=XXEtR@)7cnUxrS#=GCiJC^(w#+t`vHcsX^LmJ-!QfC71p+4rp z7K4DC=zflSDv@~`HD2kTwMgwra}`u!fnLcGHS+Tnh^_yc$6kR|a9o6?VrWoDbzWeA zX$Anzae_&*mpyOfqVAW=qe>N;Q|tlQV1m%BZ)RGuc~5ZO`%gtX0B(r#E%-epL`=Pf)AOFg3c zQbq6O9}PM?N1Bbhi~4+ChXuJyA)GNXt13piv2HGmW|Of6%}>wSm|lE-nR4;00Gs25 
zYW}JxKcUOBft7Amo@X}1k7wi?L!x0{GXJxTZQW9~t8lFVthN&_^>gPjyZN|OS!S>C zsw7^kA_P89zvo-m|AagT%(FCiWNsLhAf1se((*}X?Fj#ALo!Ro8+e=;#|#PvhNMe*7OGQScs_h7Z5ixF2lYQ(dFH*@Y~Rh zvY1J!bYAdh#xDIDT~q(@s6(mb27+^HcWcjc|4cifYRC>9@pL*XCUj2$J#<6_)=e~5 z6K2jBT@gGP(y?THZIj1?rQ1B^v)8im zNFVLfCAO5OL-p;(=32Hwqtz5`FHNY z2RfK{txYQ1N|kjWego$U;Sn@zOtGtJObDnoS0P%f2but3bCl$~u>nnb^6W~RM-fro zda&7A-#20R6X;@s!Hu`0qocF)X7cCZ({+@ZV-9&cZrdk(ZutcM06qnfJov@A9hTro z1gIpy2)kP>_HCO_R!F{@{a7GGuVW+Dy^b{JIq=|N&K=7ZHDCP+RjkZVUU9Wwfw~`^KAy+W3g_Et0(A^qSHHX>i_OGAWh`p<{= zgj`#v(C-?);T?)(gvSu7nu>!q)iOdVfQmUoUJFrO(7apL5>YwvYz zbUb=b-bo;c>KuSJ(g?d3%Z{m`F=DcrsK$m31>PX#&o|b@E8rS(^Qq6ERqK-xp98@* z6IZ&NR}_dA>6fgR_$Udrku@#e%aP5g*7?&1l?f2r)NOp3@xsl#F_tZl;5UGfjDK@` zR$!*Lv&MIA*Ro+%JRa^X_BY(j3ML8rYRUbkLRY_CzV8(p z&W=A8J129z<+8_Jx#wE&;n~GO7E)(R0IPDchUF;QbGfOl^kPAPDX)67_i_^h=+Mu^ zdZe>j5x5@Qz6VX0Na01NuekCzyq?2Cmox!I9iou_G=55hxBT0M4ma+$hI!6LZZt1P z87E+QJGUS9i1QEH_)<`i=<*H_>V|D!cbf)oZ%kiT!dWs0Zqdci3${ygA1+0bjh25Trdrn>G0Vs>*H&{u20 z*GfFIVCGG?Mq}@nD{P1Z_;6Hmb$_4fHe)xvGL_}HwCLU z{E--)1AwJ^5i^jZFcLVteVK~MHX_{?RRU2&lI|7OH)I#FGa{FMg47WY&R94AU&L!r0S6&r<97K5s(eJBxYsw zjV9Dsa~jYS2|Xhl*B2%LPbZ%4cUT4W*s!4A)u29$^H|M{S*kht(Cg1!P)7#SvlvxI z=$N?2Co`1}EvuTug%=H<7X2WD5B})TD-~75!_AnVE+cYo^rqE)^H&%d2zry#ZLIdA zIZhz3NVoUs8%C;Q{pLP1Qbatq)R+uz9-)U6(=h6E4T`REIlCa=`ym)BsT@FR#2`{N zu?G|ybceW9YF5K+dkga}U^Rt$MPGaF)9@2jA%jwhZ}mKiw) z6#}A{C^;0Hi}-TtblZDKi!)qvFZK+pB@KCW7=4tW9|fLt>I-CoZvf@Kx*z{hN(R{`slO9wYs8V!kRdgk z3gJ7_%sBChZhoIoLXU)Ey8k?@#1+gbU^?nC>i|thjL1-G5jILTyV~G4e((AJZqtM7z*K4Y@5ly|&iA^xeN zT;lFKo2olWH=8eSkr#jtmyOH z9;xo?M4FZJ@%4-0gY#qEEvJFXV|jS8IF3S2&jspxRSkxsAbz?1o^BR2^Hq66XE}dL zLf)GkVllr_ulf*|`p1*12L5hmW3qiN3($YCF7OTe_{0C7)iUHkEP&!60o5ZwvKmn9 zUC+{TTot+ZOTozKqmqS+Tg;*_$~8zP`xl%VYAb1g_9SR80Se{6&Q0um&O{y%lVl;| zS@h+2(xs!8uXyxnB(x}C;vLKLPA@=+oIs>aQKu{n47|7}SOV9IXR(*LkoeM)oC?O4 z@zUDg*)*GLO7kTlQ~2NxmBFUGCqcdjj(1q`9VOErHY|l6O=9mqdgnQgO$7`^3oC6c zB(~*VY4c!=_CikVLBqOqDKqa?i~)V~q2lguEYQN0mVK4#FIW!v9x%cQh8BTfSxHlxXFBTk);fa><}3i~GkAQuuBq|0ftIy}h=JC= zFt){R(|d6sQP0YtH3|L8w(jX32nihHXV ztO5OanH8S+4q*oR#$K!J@_NdnTi!DA7!)lEJ*sw=nhNW84K5avG(ys|nBEO+TS=tS zn=?7Smc5%#5~_mEyKZ^Ik|;X!<6#+3y&9|wN$rkX&ZzjE@{u>Dl8s!lK-U(AYvY5bREPI=(&VpdBGAPq8ZTzo_95mUK_{!N>KtyFQV>(&aafghI4$Ig z#B+UmRW8NA1bHT5S>?z(+Vn|2pe5dxru<4;3upEzdJLrL@l-4TBjUZh?f;lhHsl zb;@D$RGAm!SSf_-6tA4^`)&tquBl9Ahg2KvBc}36W>bANt189d zX4P*kQ?bJ+ydAr6v*w?lJnHZU?_n#wLKj1~1EB82(U*)9!8ie@pl-%UFB{;I^Mt1s#b433Dc zWz9y`dVsE z)^+42uLi}O3Efg)@PPfd6-2#t|6@NW-@JcT?~#;!n)p9bcALSKkL+=vb<}p!q2rh9 zG+>@o8tX%j*E;*-yLEt?pQlnG4p4!fo+IMu357NlqiT^9*1&F8v%b{p2rie8zRP9fVIkjnuyEEmAR|12;x- z;z8;m?Ho>D#J(S$P(Ye}ev7<60w8&xRtlpxuH6?pU4xGuWPgQWUZEVo` zPoKTkYeWQnHNskp7|&c_XIxIlaJ6FX4T=jB!NbqsQO9yL#?y95)w_2m-1Fj7UWk21<`R6YE zxib3m`O!Acj4~olBol(Qirp!14}gpWh-^3R$&`LtqxE{5!|^$j(NkJUC1}G{iQcS} z4YJ-PbVq;k8v_Ir2Q8k$KTtGa1|=!g)hiw>XxjO0FjS!9Ht3ZJ8Qi4yQy6j-5PsKa zk};rCsg(yP2>(BVZ;cb^azot9m`7&btU4u@n;-khv<}gxwmq;umOK*i&WlmI-Jaf& zu`uLs3m|^dz_f%=8m4L{Cxh<+zdjDV=-}h8!LH9TJRkfbTnw@{dQWkTD{aHeG_K20 zIpOv8Cu8q!eAQ@DH90X`%j8bIRL6N#@!QF5;iEkn4|F7RTy36%0`$ecrwmYdk=lz% z+P@c?d+n9-qY#cRr5O}04A0Po`&LQVEFAhk^UjuF9k`_>Wl%HBSPuPf) zv=v0|xrZ`)a$-|H;lT`HOt6B5820nqB>n65d_H{C3th z-GrCmCTOpLYfiHD>q5{czHec5`!8MYn^#{ycf9B`z)yPapeDd}9$qd;;*T$NwTdNBASNM==z;ch6he;TrH*Gq3s;2bc0zMU zCV8FbSl{aISmaN=OdO)lwRB$b7pL2Zy53-#M=G6Jw5q*wQQM`6w=i2kQbX` znjr(ISS6q7lWh)ao>zs1jgzGH&EKfM=~=t~^V`7nT=;bu2XV7(>r=~L48)V$N{(=y z@b?qi4&?WDx04DK<8RK-9qZ5IJR5vsDT#>n`0v>%9X0i~0oN0-N@NEE5XNV@H0Hi?x;Cc}d)f$62aY0{RBJTK^5EQ#YyHm8aR|Ktqz1nX*) z{bVm2v&pUr`_wI~9LoZI5?2FfP$g=)abqC}&Z)&9G*DG7*Ii(iCT(eHeLIi^o--^z 
zzv)pY-DniCYS{XwEWu|Jq<`$lPAB7NdSf-G9=tPv;dM2V&XB~gC6VoTyQ^Hfg}-L? zabC0PB*IQj0FVW6>(lXZcbZYB$d15Ad&1G=v&d%nK0IQga8^Ci!ckINmw(iF8g9h= zT2PPpQhYV-=j5C7NmAWg9VM?W=KHi}HhoJix0^5Dq+cL^KqaH2BZ=$C(9;PC$-S;^ zS$+^{9 z2EqUkxtXL(8omT|>}r%a$SeZoe^X`sC9m5-1>}BfJT7u)Zi+jCH)7~Dupf;r7Z=ho zD|#g8(=z_BEmUJBxEi6JOJr@4)00;pcOtY+( z-SL`{q3Gf}y!Bd9FBMK8uZ{=bzkEgeWh|a+$lWkoa6VVql$3WxC7XND!`9K+#iqL*neQ_#}OI}H;qd%f={nt&>l^l*N*a26} zOeS~yV4Vyjr!ps%g&pg2&RPy@t+{#LMmj_eGY19Id9Ih?3>V1Ex5`*QaXyp&?mzWD z%VYOJ=CDM&0n1M@7=7z z;+ViU^f*F+PFV>N%GpqE8ReC?NPB=%-Hf;OJJ5WoAC>(N!95=4;?lWmfz=W z#M(AyJQ>nROSie)gesLq76vaN{ZMQx)esRri#(ON5O3L2Qrl-I<+n&a1^q)o9_-6hn06yK7alkW_n#?D4)9f^hzxIhH1faHplQh zJDoMMpQ_*57RPHgB%vlUdSD(+M;;9jll$GfJu-W;ZPV~Dj^r4d+DpZ3@o<~tJu3ap z5oN0MA(u@67RIY=>m_g(2QLu3DMCX~(dzS@%|E}v(BK2&!9^i5a6n@AC_n8TmvaOy zoC-^a_0?&=$#Vb1KuStkjPPKn;?f4_sld8|Z<=Zrh`2(kD?QXvFL=)JsP$3cx?pY^ z24S@?`UQVb*HD#WJN9qQ7B`R^)2a$49_%YMqa(nT-gIP#oQG3uEwsNJj&aDq^Gnc@ z?@Pmfn(t_YOTTePQ^App>Cfgi#Ithk8S?yY+M?rSBJcTwvP*UEgK%w~nb#}K5?dQ5 zs{)#&`MnR+g@tMN6eb=nUWVlb9CpMX*lSb!1-_ilQtbv>Y$J+Z5OtjGd=8#DP zlzGW=m+Xw67YE8rs1{a$&2^i#-VJQs2k61=4V`&Q03~wR90AYS-=Sp2=EbGv3VQ3{ z!bwp{Nyv(I%h>l8EHX{f*0@M9_j#!W%rJx7q>o<=`Ni&id&O!ocW8r^?oK&*ROZ{L z?-=4}Ja3H1?o<3|;vy~L@uIAPw`HZYzRFRN%`fb;YJ2b?HsBH5O4 z2|u9#Qp^caG3L_>$gJBF0FB9-EQJ|Az1DzoLotIZ-dp)vE$?<5*otr!zqvrfzB? zap;Pg{W+IH9PdW85lcqPH1%e?N={?LVyBbzkSddOmK|{a2i?z<)jGnIe7t}10jX&3 zLh}Q2H!Y_>15U*=Q>W{$QG?6XcoWQ0Z+96h3)t&>Y!||9y{pjwxTf;VeMkzQM(?bR zRgWWz@5mm-L`8aq`H_edx#1mtr~VUwM3>fX2IzA4Hc{_P^IVbpp>{duQ3oX|<9nF) z!{A4{DC&N$YD)`zN^S!y=b0%7(p9lt{q+{_e4hk~oX2a#1ZRDFK^G@_hdqXDOF|#f zHD5)dt>UU(^eS#R`oL$``Ijs>v$szdTw|1mY|BZ2!iDN2ugoT$&$$!KFwq)}R2akg zp6w$dfcdJBTKTRdzu2P{QxsFryo;EE)ryvTT$XA4_J2;fI zTo1^2mW;%KbOU%D(GuQ3<&ZWXx+`}|?ZE$gXd#>GR!t7Jw-qW74X{NYx1Kn{^pGY! z`&mLFNPaQE%J9SUx=u%S8_widc_)zyWe=^5oHf~;x0F?1Oyy(JeibI=Oh}Eq#57h; zl)0D37v@q*lCkd-P^zpc2w<&xJ|xjeKXG7ByH3Rjdcc!(W8O*bAua$Qe_-4ILgYEB zuiRFR&&9eGPV??Ozx%nP1#tsQ?;VO+?P0tNFT`LcVr;bm{TZVM#<6zLd$sOWZj?q3 za7abm-UBb>RgO+#G5Z^X7!V-7@zwcv#RB1j(MhB4M6}}VF*3NjWb8|h;f@cRgH~TM zW(AzG!6e5dGkA>Gu3?Cdi`be~ATBmK9c3e-EGUDEuWjd3(JZSGj5aL3!2ne@Xih~Y z01CT&*ip*6r3(Gdn=D6PTFOdg0uS|b&xU1v+bAoG`-S?g2B_}mOMs@*4acYqD0ZXl zq{oXY!O?pUr!AAd`!i95>xon`1wA9vfej znwNA(sK7}GpXKiEwMfuTYE~%_7nDFwx^d753NhZmIX{r+GO+>ii2;iL111E_W}P-a zDIv+(2=z5Fan8M|m8?IuI$y_Pq38@8EW4ss8Rk9rF=i2<{&vQToqzSk*TW0?U2qQ( z&q5gPaFb1$6rzZ{8X-l4iUb?AEOrf>iH-#+q1DrD-aYqGZbG^1g0C&Yb1A*D&rLu(UN z#)*zNlK$<&)^}ApkZzV8=9Dcn)*QO=)Zpp+YRb5pLqbE4dTDwjn&s%)n&6`WfuDX_^9*1yVKCXyRPN;XY$wYJ&4A)`zxniDM7+0r7I%%cfc^ z1rxCczY|EekZx+lDraSu@6Y|7r)#aXOYy2E`xhQL9NydZWdO7U0FOXZYqNyZolwhs z|INs1e{?fM5|{X1jy=oIol2U+Sm6q&jO@Lzo*-LLeB~~}wCT&kgi?NTT9_|xYMi(h zsKFB~D4d>cZYrg9a!oU|n^(5hsM60HdNTHG(pnvNy|0;GjSfWLtA3ye_!oeEx*jti z^%^E)V|u+U*nE!Dc2?e)s3xZl{vO7AS3~MwT8xU-M-@Vo4>rloEGLc6!oC><)}QL# z3WUa-&W`6D-yL@eC zh~kkar`j39|HA^<+4kH2Y4kAWRuw)fY@&w;ZS#5H499;~cH`l5nftwE=5Tboa<+SK z&tGC-mF6K^s6?a|RjP@Bn}!dhSluCKH+Ee(^AU3X19JTqYR2)Yp<2N;^P{>9c32h8 zp6e}@%GtEL5TR95$%EYS*LnnP*_87X>9wpmnJkrv-ZOQ!`Bt)*sx0R|NqZbp!re}+ zi<0v+rWpf*?*-H;U!2R|57fz{R3v8EkNM<`hpn3Qz<6)$=pvsG4h)y_oO2{KNj!sW zj6$O9ll4VFO90o_kM2Dsj^49ce^n_l4b+c6EP$t55X?kEy86Y=ME9i~eJL_qnkApY zck*0Nwb$K`dEv;*Yo;;J`xqwb`n5@ z_ln>ZcGC7_Cvr2fsUI-}4NQFmG`EVxx z%FfE*NSb-|x`NN~J_I?zk?)nXk>CcUtKmy`UbuNp;+q4bQ1YA(%A@sf2S|km{hECr zShCFm&Nmnhqu_V%{;Z!f3`IxMnn{xuTmzVr@W`(iW`JyM*^^#}=#NChPbRRpw69F~ z*-RlQ{U+BTOQECadOIYUm!&pEyYo#vlf{=xL#mafF11B5OoGk`Vhr%HQ5q#7 zrPhxXa4(cLi0;Rm$V6yEGUAA@U7|HFy=(N`hu)d?4g7MMP;hp(x%)YJSg3*lon)ot zL{*Xg$4-NB#Ml5pxrr2e=-U{F^+hkk<{9^M6ur2jLx+_4etfaiu8}rYj>Tn=i+jgE 
zEv`K|zrLV7;%PU-l+&*aWXiYV&6I#yrf+zvARojd-i&qLj$M6U4mPs`Zcz4_wRyrPqrYN-Gk3P|W-#zY_Y~419h7ZR(Hy(>>t$*z|L&2A{9<)Lu+=#7XQ+VTm z*nB0o*f-E5+fB8n_T)5VJimT`op;#p|87ScNwph9{u3~uQ_L|`$-G=|k|n+jP7k*@#HL)sN!x-_(FmMkS`OwL!S z84$V4sc^G*uWEL3tFLx)a?cPthrXV(;#TvLr(a%}$6PJLnUf2C%skV+W#E)H2u$*A z0?bZV6txs>mR*{*Ey4rmr9`4c{m`^#B%xZSVfap^r)_x!teZoCk2w6UX1(NG)U#Ht zDwxyLWn(Rm2#6lt|UdOYuD#lM(=qvu8%aM(_7J`6d8yKPJ3!J2if8# z?XKs;YKF~E(_Hu}mdy(_`$P18pBY zQwg*?vKkawDGNJBU_mZCY;Vz|M?0$Y7i@ZbW(E&-Hwct&Yxc8Z9wceU7bGy6So@m; zo9C&Oi(cf1l{7EDfZSfembzSOBDPB=^UsPrh00u27@{(vKkKLJN55B z#GEtD#j%pl97wp3ho3Y)d|3~CZ_+vk7ED;QJMshc3y~ZD3N#l;oeEs7-M4$%+7{7i z{K{Sk5xF8-FKJadvn4AA)HDdfjq+e`PeqkvshCMYL#>Qn%sc}8Re-=rr?Y1}Pu_Cy z3u|6OJk(!Ndq3&Mzv2Z=s?aW^6Qk#4LB3HbZF0q6CwWdj(_Oy|pzoDiNhy$?0HBDb zYd0zB(6+t2mYmj;N+SNqWU1^>1}f!P_A(->s4vLhIDi?WRjcr0yW|r#k3xLMvMhy) z7eT&$V1jz?YbZ4oTFMzQ$QSHB-T1OlUcF~qPYx5B{f~nt1-@< zDu>KrblcAAbw6o!!q=_N^p^rncIuzoJC>`X zyq@HZa0bYcmbL=3*KOr4qH|RN7z0%jaF%nj$Fm6jdzR5fDVRjwHTx=8FHh=*nE0EWc~vuAJ?-Fz&G^ z8f|tiRm{W55y>;AhZs>SA~4!B<~VA_j1y{~KO0t`&XPV#YDj|bXur#SV@-45VB+AU&Dw8NEf|iqltiIR$FGDB82NjJ4BwBB;@ffLy4=E@EA73!_`hlj z_lP3MVDKjQ6oG~lK_w5{??7^)ZT!&n9>{Ag*}gd{S7{>@>ZA02jJT%UOH!EU1RgW3 zr7X@S$r%jjACF6MLc7+;L9D!1>9y@pP4unwd9qi3q4 zVZRmmp$K?jF*{SLK<>fLvBsF%6YFaiYH3`6{J|(a`RMb*`ii4S&A9l|rA4Ryq!Zsi zk9ROhFqF0(?vp#}NnP1q$4JX5XTJfSd*9r94SAD(+d_+By<3yp2^_Z3F(o^~crYE) z2um7~*a6`sj+?@wlEQ>a)#k^BOy&KV-V~)&r8}jScR`ZQP~Z;fA+msk{HD-^7 z-a4>p(k>--WaN%g#|cyl$cdF{Nz3}H9UJn? z+tB3o2=@DB(M23;+9xr;bGF(QqoryhxjeW$JT`4As<$}rK&*m%c0D`GgxR_Ktr?)G z2U;I76~-drt}JR;@DGXVyK;QYv>45grz5u`Mv8GJ)H3!bk@qZJ3V@t+MN5BZ$FZ7$ zWW>ktyQ+C+q2h^q@h$UQ*ZvfhvM$i0DTKc7;6f&Ntoh%%irEKsyrdD}cjAM~2~J(G z)X04*QT8tttCDFTqTllrW59<37D zMaDKv+Uew&VXHoyDMbTJQQO=v+RLYb4&W!w)T_urd-e+1iJ_c95VC*R^K6EGz){nThy>KmEW5 ztipu*O|~yLL&i8M7ssERmMv>|%Eb`9WFRL|IOovyq#*+|Zu;jxosYI`#jzw&VSf0v z$%p^Oz(RC!^Gf!j72p7=^Gu8i%tVM&I-XJ8NwFm9FTHrt2<5{5~6pPj>x zbTazG7@>I-_TD8q@2ej`_Im>`9208F_m?Np>=)|V=9gRw0cOZIXDHV*o6ySR7v(UwZxOLuYQa`1!o9SbL3g0MD{F?H{yZAYz=C|zSDih0Tn;|uy?t<6S2S>p5u;Za`kF(u(77WV?1Czni5R6?U{^v7~D}D;~&ZwG~JLQ z=imtWEhdxc48w5}bh2Z*qL=*4qZ3Ws_HOhgkZ!T1gerX^!b9vt!tkJMq>hbBEgeO1 zWvZJ`=h*;>Vkgy$$5!%}NzOH9>4eMvBiWDj%IrUwq7pQSNH%(Z{3=(5o|HWB-*jJ` zy^D5oUXnd!q;1elg!O1{y5!ObxfkG*=TY%^F{hFQnK(Se;i@PiRh;&iX*8p*tP#Od zcgZX(Ss1q^_rzgK3GWjFgQ62{E7O~UWGeXhP>hA(qN~66*#{%0xLybj1gnMTJ8FB0 zfW+z00NSLQHWuWmVBmJ{<{lg6D6;Dx0ZlbidN_)BB*C0u%9gdJ%Vk03krn=c*hKZ0 z|J7pkvSJrM`i9T!HtAcpbIf43bg&=TP#zN++#zXsNh9>}-$~<^83iPJSwk2d>YRio z|E&I*AlbaVbxxL1;~kWtK{XC`vE+*NUmzx@ZWtW5&4T1#Q47`T$EUqf#2PDP_xWs~ zHu@)}==0mV0lajd=Ic3HU5Fgf@E{=$<$)7Zs*Ii~%gO$zuqS>nF{N4)`2psDJ`1oG zHp&WI#sKcSKJtIZGYn9kfOAEX2P4PIkG^@&AdvPcu zs*;PPuuH48cg)U&@iP`K7z{IpawBt>zITk--ii^WHruO1o@2{f@9*;ummvqResW+4 zIBRC+_}lCYQ(2!-rp+v$=^ObJ*epwF)EpGK{LF1w?jzg_OWvchGRBzWCTr}=6^%7Q zn)*-Az&1r;%e~vaYC_U44r1}Ca!lw>7Y(+c?OG5upXO-`!Ct<#Ok{`XFR9lmSojCo zW9&gM0Rp@LptXnVKIso8QtPl=G(;>)O$)?GZ`kH1l#kKY&j5+ycPA;J8vwAm=DQD_ z;*6-%F_2QLcg4sp`*>Mvuos)rc{751OkQz@30xblQK6(6pj2L}>HIcl?Iv%d{PdXRKxd`=v0*Em zD*Ei(F3_+oG;J^o3+ey|JB_#ELFINFD^ZKr=iZAD#kVj2O~I5@wYGO6Jz&$VD5Q%U z8BYU9bLW;eZU?su7V^MEOGlKe$>-OlV_XPMw`dHNwzM7-MWI6fFac&BV%gwXGsl7l zq8Uj+Rh^!S}15#p8a&Mcp=`@d(ubel*;VWFb#F1WyooFC%rmNcx9ExRv&S7_F#}3pwzFZ^5 z!*AtZ?=Qs5uT_06)5i(jWhJy8-)ZYHqe}Q4$uRe$I(aIPtj4iKW^JJYb0)ti#55D( z4cWEN6Vg&7zP;}e!}Mo1s!KOjuR;TBGBjtJ>^kDB9n0E6{F$O!nY*mau((d;HgBIwcem$t4a|ri<0?Vh*CS!gG_WX z?HDxBSfWm6{XWX}td1${g7^Jey&tYlx7M(0xzl-bo7WlhSO_h<@$;Fj`5gmhdFm!d z^#u8TcftxP{GRR78ya`aR!uq<+QdZ5P@$AMp+TgV4M!xK7=GjHb0*cBs*qqn9avVL zATd5p5K)(@!vDS6V*ZFdqF#h3K1KMQ`WQZg%i$GkX)L-4j@>qDEYdWsUEUHFV2isP 
z%j4o;#JtSqM{=X8L_3>-;j*N(v)rsK3}d=zk@b8j#h1|QG$mo>%CpY~yvaXlx{5yt zx$j}^!|z7AOxBYkAn5vW-|?)d%8jSE!^T=U?7EDVvN8J#$DR9n5p5RjVid^~m=Dy5 zJ*BLyl@nerIt6!elSo_}BRb;CNxF=YRLkfne-jBi&dr0&!& zcAciO#~)|WOy?WZiY4b~@9^)Q*EjLztYwxjtfAvI_uu${?RIUH@R)LpSJzIICzn!M zI*@fX#OdMbf0OZK@28?a*LaX|F3LX;B?Pr5|FIT<^AQf2IoflQJV4mLBJ$XUx+I80 zHduAuotl^0k`SJ*GYoS-vx61QZ7A&y+$s0VJRBwT9Gc3~BoCkX_SlYf_k|g4sBF5w zN2gL<*Ub*$W+3zbzb8`~{TX)zW>hG(Cqwn~@f}02+@ve#Rq?817ZvwgI&bvUwo@E} zM8w~rD4aO{{ns&0wd;z3pCp!A3_oQqb-7U0qH|>0YohsfmWjZ>-(gddGXH#=$Slo8 zbx7_-U7wcr-COeNBrSY2c0ig|N}|EC#O(OGu*WQ?T0i!yT!SyHuJ$%Fgb$v|sQpSy z5OwJ~l*S$7Zn3X_*F_I5WuXDu`qWO0RQ&i_LLk%;8~;N?ERs6B-2rw`FD>F{H{)?5 z=c#d9f*SGw;Zdv+ZfsSh#FaXeT#c==tj2;iMe%+)Ng=&f|N2q!slW9;sOnHl;lp{!q?+N!}~*OdsQ-EIINJ?SsyIy2f<9 zQainp(8Xo`@%+wXs%SQp*7jBIc!yVlL+a3&t;6_69fDJld@KM=J%UDxVfL6k3jnP3 zo$2I=BM2p^(V8dB5$dg5`TT3$*u@Ys_MDvuvta9Ls)uqCGw3yG++sWK>4d~@hZ{A1 zXJn~t0#zlH{-6z@i!^+Bw&|>4= z6G^cyIX-v|%j;hm_!)x%WIBaE`G0duVBF>Xiaq8jBgoR{a7r^?!o_UTKIwY<$g<=3 zPgzBDL(pbu$KGOvHr<|+u(sgg{i94Y(=u=L`=`Yz(YDuW;7Ep52qJ1=>f=Vjh78gd8H3+U+t(VsbOE;bju)1fdj2ETw1O1c3B^67dehEDZSU zEEDx;`aFZ+ae5R?C^4x+A`UcP()@xMo-X~6s;=g8Sd3?<@xok1QlIvy*uC9e1NZ*( z8JRI=#dBukgygB)_6J+FQZ;g~9F*gIMiWgD0b<$D%$shRF(ZrbOpR?``nJ}pIZ#!F zPg#%qNo^Y7zPe+XJRbXo6FokkduUT--zDjV-^FZkcSJn9`BXg+Q9(H;h0eZ z!t0uJYMz=(gcVM_U}{Krm=b$xHba_T=+%cMkcw~s3**Y4%{8{yS%ZGp>jzlngRigz zTLnh36XsTlZrw52m31?7Nz<-2JNkT>3vp+_#_HMDOw}1Fzn(uqH3A@!CJyE&469|5 za|hW@^H)wF)2{TG;6A2o`>A1d$TRE9RrP%CA~HjOe9Q+}SSq{b#PM{$U%!~GG_9?b zPo3sB^aGMweWA`1}`EY|z8~)!nsP z4MaY5gkrs^_>A7X`;JoOS(Dev z4LF=Kg|W&Em~U5Va6kRV@wacu`tRe|zEAA%wA&UnIP*{;R=l*)$mR~a&R0zs+BJLx zYSn~Nxg%5|JlTdS^aA{}2aUXuyA^f`a!%#(LA2Bek-H~5QsB>}>m3=r!{PCRI`XRR z!5>w$7s2f2I2)BnT(p5qQqL3_@d6|{@>kx3>Ps%$|Nj+^VJF*vtg_LAM0`q;aSKPA zt<-2in*9b{Qu$Qe0EH>H^Lsp7-ejyw1pmy4jOpnn+kr5m&oJccNF|}vdJBn3?ESEQ1Ay>t<0TfHFb#eg!*S&>YOkxb+aaoE3Xr;ZFvBtqVr-C zSwr?TXGh!9t4aj2iD$-{+>w84+WC5L*CmXEF>$0N|;njo)OsTpC|+f;+k@Mk9^@SIYoBs4Ci2ekFH)@JUvj z4H!M{=bI&uu+vV70-{Z_pM&xKU-{)>B#)z4^SXeo#~MEi^w$vmxV20xQh(a%Zh;A! 
zU1EYO1&B}o``fQTR;{n6d@$=~%S&)K%)*oyZsIer1TJY$Rs6vn2*@^TAt6t}AM0BJ z;s4dQKlb*B0&Ql?kEhJz>*hRvzYNkOiTBrmJ~p4<_c+1>J%%20dR|hA`{zgz?OGNV^`giJV<)#t2W_e43mLI_K^?;`Z)1 zNFXGd+6HA??g{sf$u=7ec9;ObL48>DDdx)`+WGF%Mzo>N=ad>HkvHd5#}PAP@cGK0 z0E{weFLN>DyZbZbG9tBqqG;YJ&;tcbv8yyVf}0^A|1_1CbMyDs^tcGi3$6VvHNsSj z{FT6H`=jh7|NRj8+f_!@ z3e&F21()u$-@yJq2+7>*x?-PHY&c8Xt`2dD^|naRqtNtMjx3}W-9PqM10IC zJ?j&ALnqT%w9HZ&R|0`gM3Hk5%YMlw+;Vm%K&Y@^u_c+HtXuV<0;`-F0a1%j9+OeX z&g0k<06#jK59()qNE!PMIpoaNL7HA0LqR`kEsq zlqFQ~Xi+XVCQ@#v$mJ8;OQ?l=lI)~P?Y=}u5f<{zF7A;`{!eu@T%8-r;W?$}AhJ z3brCf1KNH)TJPmCegoYLFM@-8pwqOnEzyVlC{EIUlO=p3cuvBOeMD(!ANL_)b~xKYm^WzS4Z@`+s9nL)EbU#Lz%fO+Lc6 zz(J!v4xO|!mldnQMet^qT1eW7pb$*dAh8x=J8s|03$+tCv$p*mFXBMxikoEZ2HwB45^T_*;IeDC#o-Z_HUn9~JQs(+4A z^WrJH^+^1h$|%|jffs#-xG|+Ek6r3Ugr8Wk5d+=Bo-gk^IoKvxpvup3 z(dK4uXhZbLT7iJcTS-Mkb`c<+g}B-#h7R_1aa*3uc?w3p2ywS=io569F(cDFiK7<0 zsY(A@KU)@j`rMJwz8gp;)5{k3eKk|^V<{X+39&M+$+Iq*l&| zDJK|E4gD2RF()NLd<~qfaoeg)27TnJ?ku@@kT|7NnpE%tq!bxT%cAFlfsu$ygW3bR zbw8*(U(d@sHL~dxe7;IINhy}W2$b=tK|hy6qRs7;T43awRix1Zb8mmhA%C}q7+rYR z^!u2OW{-POD$oERu+^u%-+^5+7woo#i@Dv<9UQl3z{j_Z;&1-e~f}ZZ^L1~e6-DNlSRn+gXP1V{zeZ&cD(`Bj0YZa}v79X6n ze-72(P23@O;kNeBjvGueqIKC*r}wC1EyNl2xDcpF8c_{*nk?hpx(3KhV+$=#V}`{ znM)}`CgtebD-1i#NR`3(FN zZ27{PsbA#@irfz+07ia#D4}m*o3whMAX5X_15Ffla$jLzK<{wcbXn=!7bCy%;uafk z5hBkiHcuCjfhE?Tbj~2k2&}$RDl%rx;Dvd&qJnjJ+w3MW&-@~5Mu%Vdt^<3xf;f=3 zbLbZ+%*p}#>^;@Y55hUXXJeLh=~;oEPLsTN`LDMK6Yn#2K3<)Wt{#~*EfB51S>I5K z-nTvP&(UoGism|o8dz3s&M)#=D1pP;hXy~TgpF6#oJUQ>Vk*N|4ihH?t0|0Qk^lns zd=r2VH--*)SRuACnn01+(3ZR=;0;JBQAKl$y2(BWt*dW|dL}-TrIbF8I;dVqc?zdh zz^zLs`SAYu8IF)-x@u#ex;COsjuHvPtv+o;sGN2ZZle8Q)hZZvMN#)vi;mmuXVGPO)7Io( z1eCS`IKZdALl%R&)!qF!>{j%NY`K}{*|_5@7>wZ8nsg>ea7ciB;X~Cy z=-`qU*8IbOfWP(<_L#8R#f3+JQLMN+jq+)-G|l}DfJ)aK)F+5W=DR?@8HqZHy9jT1 z_|-`K`kYoQD9_9XS#Ux`Sw9*87^=u#W|jk{Hm^~CVJO&FmlRued!MHYjlJN}BnOQJ zerR+o%kHqi05gr)nVNTL{EYU_m}~WCZ^}v9uR`wt;2QPvSRSZ1aGOADc>gxV+zSQM z5(bf^TN|pI>IO3W+x92@A?v-~=K4>7`xFM9(}cX^>*fN=b#Abynq>(7PlEnK9e=hD&F>57-%9- zpbDq;R+tPB65aOm+YLhb_nFFxA6YxPc@6%?GD;&Jh~JQx&; z=`z4x9K~a{{xK>oqn&4q$^q2*UF68mV-E9YU%$UkE3+zlS7v3%c-Z)1YFMG8 zSsJ84=!Yx0+fEiM+L}?;Nb<2cMOIZ(uTEOyKF@wpz{H++>KRgyuv(9U-JXym7LWq` zrcpn4<`NpHU-Rk1F6~|O^eAd=Q_1~sOT^;4fIu!|V7LPa_B;c30td<#SJxU9BAFcP z_3lh-hmm4alZaZcCoLd90}`giMW9cSHV7yQv|eX`{)q9~!cSVSHgMEITi{O^$SxUM zgq649zjy?k%Wz6H<=;7{zO~5phYV;?++XK_jWrhZYb@t+p4kaGWzQoG9d3ed$sZy2 zF?=zVsd>k^;?q$41C_EkZBZcqsPdt0Y4HvY1U2d^7H<4gHc# z^dB?GRQr$5Te7(}at1gZ;6^md2|>*miZ>jf)+Sf~csWxcOu=%R9+Ze;TIyAoBBV_Hh(?_t&|M`SLmnYyOV%q{1N4w3- zBu&?|h0)rl4uZO|Ky<%%O}$E_D-tO8^}mxx93C)WK}oJKrXj};!UQH*>I=srr2s@AUE3Hq@%HSbgvRQM+LMn-um=?@=KqT|{ zx>2z@>429uQ1OXVejUzNvZeCYB}J{|b@j%fn3$FG-R17S_||F@cik{v)HGAAhiXgY zpcsDD6UdJQ*XMP3sblm-<;h$8cgOgFE9J^0kcVEw7D^G`Ukl|P6-bCPVkpo&`w4hi zVD!nG3@*Ng3SGY}B}Vo*4Y0o`4T!52cMASTqcvd3#UI|cCN8SmS+eAHRR<%ZSqD{T zVED@r6*7y-imQlQo|1vg3pGKPiV@KnT`-uX!Srj})+Xf585Mi?Jh#H^jpx2WLJ;*w z!Z5r9^_MDFX%BeggUmi#71*v4RW1&w6I88F5d+t)xnSSTS4fRpLg!_Mu;s5 zYk?>nX>&yzNXnlHV<_&x_mWxOy~RNrhS%PXpY7H=k$Ba(1X22nvy7fGHl@hCs9xN#=b*UJo8S3x+$+HicpU~i6&Gz)JPZ5 ze-gvq5*3~XnyMccfb8IWPyq&_(mqTRA>>5;nDZSd>A9GwB0$DEylcW?YjTbYMdi5V2{GazF%0{)%nrsZr)lMN!|l;`FZv5 zWRi|w1w26i^R*S@hAAFM#uw#t##JU`A^uAUWx62w3m(Fkb1Ivqz^+_tpz`Yo{e@sG zrJOMID?N6=j`D+MI*N4vc4ZwKW(p9MEPZCx$|^n3Rsu*u-D~=e1SGN6;AB&N+o4II zoRqj+8n0W~+2+>~Xb?kfjhx<_N@EF^b9cSeel1#p4!C>}j{Cq-UqA>%WN%l>DR9I& z5q|3H%CAfY8@?2$ZO8_DpX^0el9UN52O4^!d zaGWK`YcGd|vO1?VU1pckB->iG-a8h9nnp~xCYP!+29&&Na6%^ZYk8t7P z@K56=j8e;VHCvH`C@4LrfP6nYIlp))6-*E|rAK}NRqhvZF`Ew05e8Or{AQD1Go^qKJRz{3?T znsAMiMW?cgq@(;~IOZ?=K@Mtqs$)*z*C${R;?v{AJs<8Wby(}Ua;khxJfeAHb 
z<el}-RFoM>3gE(I zZ(@$(H5rTxOuzgWL%)JROb^+%6%{0^%sVly1e8xcyPyns>(k$NrQ3QO=%ci=SpzaA zdA`d7+((jwQTkdK=ryIuMv#tUo_4V8hVl5khn}4Rv_dYZEbyR`)B1i#m=<((R%NtI z6;0skJJX8CZau2cTZ;v`UGe8hy)H#-%_i2*RgGWeTKc_!zyl?RZxb3^?w3hP z?(Q^slG^ywN;02;%iWi$nj&&loUNn%A?yhQBtT=Y3S{7B?ok$@QZ98Nrafii03iCx zZXdp%RA01NEgTmb?N@ydtvA%Uj~S7vnUr(~==sEK5h-#M2U~rOzgAy5aC1Ps5RJEo_vd+uf!s8iRhihCO#CA zN==zjv5ytirF;#-U$VilNZA99#W+P9u5Vfxu)%c|twK<@5$T!X&(NiTAI&mo0!MDu zxCBqqfJT8v#b9O)#wVZMMT9*y07uckEna#ra8Gfz;nlngb`r)r8@6Cc%KJ0}^W`zg zr7x*ILQQdP>oGeWQG-`(YP6A3CH44d#FtoHaL<#d8NpVY`DaT5b9{Ucu@XX_fcZLN zMaS}hYy473L0Ir$cOYI;@2Jq-1amlf$&vl?XKp+NTiGHXLhG;TEhW+4x<8JC_8-vW z1{Xz%6Y|0pRq0ke0vg8}`Cch;KB~e=+$H8S;N$D=Hr`HTgSI`nm~#_E0T3iQ;dFY z7kX>JYJGxitopj+0(49?85j#ghGy0L~b+hLN~@!>;^_kB|A_+AS(Ecbtdu zHP3AP4hp&sMbJgiO$;;}D59n`t?nCSViKSMV8R-X?7o{YY&fc4eqz$4a!6H-en+Cs zJn>R_;;I8w_*5NM{8^LZg1n<)rTdxrn6tFb+;CTg*{{7T{W6%WT|Ya@?EJ}59g%JR z10vi>&gST;9f7;%ftnz9gI?_NnVnot@lM*61APvA{YsbX@jlGvWNhbJJ}RF=W3_8RsX8oZ9PJBE4nuHD3bZN<(EmgE_{BZ!e(1p0bYW2jZxPLA9&9amZT>QeTL84B7kIZ?3eC`xeO_*n=N6Lety?nB?{zbX5m#^ zGQ!j3u@J`+!h`#FjnHI+aA(rp>Mw49)1<$<#sd10lmfIKmF8nHJ5-d6`N0Sq3 zZ+OsW%3YH3>7LtH;z@p)7e>r~`F_QQG)C?zpA2m9j$b-r!!TE7H=mIxdGg8u2X{0iYBg1 z>cSPDrrAEc;Hu3`27B7uRFaPxYcU@TpS=iRJHuw>Y{8$aA?ep@aB%BeOwGGtZC?mk z;~%gw@ZAkw*>fP(VMhq3C`az7LG9%QGiU_WjVulvX-VMC!IEL4bJKcisesf@G?c2m z*MmsyUHGx-drq6V^fd*Z%Q0Fg!O)kt|ArBS-~p|${9By#<5=SqG*uL@vdBpX|jf!Wnvb~9YzXNxwnb# zjuBT7o|J0SDVF^Vzsvs`)~bnr)y5t|u&%$7j_k-bo_g@{`-f!b>1I`O&|1W1GSD;> zxo|rn%Im`U;>)o zEq^zCKC#6`Z#`#3|MB|zYA|h$Ij87d0$S%r-Ft6(se!PikcB+8$k-ZG1Fz_8?;pI1 zyFd25?=z5H>{ke(!WBL<;vtmKQ|T%Fc}TGTTai*k@2_8ZT`At7Pt*k}NB3F2+ynwO zl9TpP$|Y4Xo4xy4nd_D^7=>1;mEEakJRbTs$4qPA%OW%ENIf;hBjnY3Wbk=ixeY@g zD`zL{(H4aMkW_W1j!h)k?uZlc*Y5Z%Z-R&Y?FkA`gKdB~awW^VH*|U9c@Q&v zdrj$;>jU`o$XEL>za|JUH3xUVB`62Y=GCr$h{)wd8Em`1)c$$_DIfPe_qO{Jl@==? zccs?A`xgm7r;BaCwRZ+@p`6&8c1Tl6i=*=$3E-TV)I}vE9zt)yiWmg})o5aw;s&tU zFV*_J#D)(r1oM)eN(P9Ppwg&3AXHZxybchpL2Wnm-i(Sof)%x2YySo`%Rs=16gf_4x+!gQFehf|Y*$a2GK0^-2BMIE3w5cWil<5u5;vD?@Wfad1c@cW@^V#<6 zH!DKN6e(hB5%Xi;alZ)?+z?5J!3KPpkXYgI1Nj+cEh)}tfd3J$ zs}kWE`h0Fo0_(K7`OE5=?-~Nrij~P`eAW2z@~PtX@k~0xZO1`xEfW19USo7Q8;@bU z<_P@7YcwD+!N;8XVf!L#WJ@vt7}PlMTaxjw)^tELt?w_GvexilKF?8eY#k?x$WgUwb4Iv5e6l7KjHU1@Raf75hesRA}OBL(7X-)m?`=@7XX-K;BWWTk9de1?SYjR z+60Gx%u_Iq`nyMHG2+zI)!Zrswiiv%=d>=R;~cWq^!Li#TNI^zkGazGRi8g21m(BU zSs?s06zsoO?25pSF&5G>V@Tt|8+X5==N&+;ouTFZ&!cgv5>?YSZhMtTo+qouJvIeu z8(Xa$CofCTiIg~=14q|b$jgW*0(e~DAg7D2`EVWs%pt4?p|v)(T@Ry8@qPWk1chQt z=1wP}y0!`|WoyT$ilmS@2ccE}Lkx8UiPqlIv9b5;^3lop<7`90GO{MX`CzxBa$@z< z`JnJiO=kgsK?l^r=9D{g@U^H8uTi_9xoXM@5X4O>6=XFKxC@Xz82yN>916Tt>i#f72xcs;@BO_1O=?G| zg`*@iU?a-NMJOPu!r~&1i~&e+#{d(?A&wD1KqP2rv(%uI(5Vjk#q|k1;6H}Ndp>Dn zeX^?`9}`_AC51s1u6w@(3fM?^(~)&_z;R7f$XJQ&69|007?E;VIGq#lEKLu^hCZ^@ zBv8u}0P3Gd$0S#aXgQRIOeiBeK!B|aOW0-&G{m|$4&?N+nmhOdKZ4Y7 zwS0xeVcHBI`HEB#wt)V_?T>Fixv@!cIEDo*o@3bn47e?Mz?cpGkf!dhlZ3prRL}FPjH%b>n@BzW{AADrUgYNxw*x&u-*9J8BfAL)ZyWIb~ z-2c1W|GV7(ySx7#)c^19{y%|j{=2*XpWNM*LU@vi%b(qW3;m?3kG0@%QJ9)^PNOWw zzcH8E-4>;RqsrUidK%n^Ul?_|u9zmj6NMPHx;5QfOr znk?xaa96+^4J?TQ3Xr?^eR^B}4I6Z3L2Nr%EBX5`7IEO3Nde${%FO?lqcMX2wKIb4 zd9@&l!b94P?Nh!s;TgaM_fhyo2<#v? 
z_CaaWAHvZhL*JwQ>F0~84vrWP;VY5gv%c-Eb^2ZIt_@y0V12K@4>uLlp#?Ncd;*+8 zC|t*(pe36pr3kEm$4dAP+%@@LdUeKmT}4Gj*l?l2*mHA$4~0k{-n@UP5nc|YZQ1bne|Y!D52Qnma5uc*k&^1I4zeG&-XbkW zy^8^^eO|Ds*%J>jeT4Y{4*k&`pSPBBNT})iN@9P_Tv!x7Tc-fuD&M<;kn=-|$$tzB z`)@Ix&lfvdCs80+qOtnr;eXBw_A|REkY12uP;iq59DNMs>2FP8JQToNHAR@L@c%FZ z#*=hc04gV$$p`-HdExpE>(A3GVW@z*-}>>^%zldQPj0|I{lD)ue0ae$ljQB*0_Mk_ zbVw_IiU_E=@IhPXdGGx~o`{*-`^I-qBYrSLNRJL{&VFxf)q?lle z4NW%#rDCyre=;$>4t{>THt+&VuOlfp}b3wCa5T5i*{1uByN zAjfAoOwWv1RtNMxw<*R73iVoeGC&a!3r$^7yy*7|8>qT9(jB-2c(3i2x-~XN zusqjGzv8QGe~7bhL?A|aHaB5MEClFEj!qwXODjGIE7F3JQo?hId&!o<{XjmrNZY&T zI+I57QUw2y{Q~4;Qa7UBML1AH=q1SXn&+r725{b1-$^D@r162I-P9VGDYrlThxBqx zAE1@e@|+9`kSV{?1U#%^fdlU+Yzj}zc?Glq!Pg}JXDN&8D_9dcr5KC|IM zW@esmZaVW?U->pXSLeLj>RHc*f=myV4Lijn(GMXO^maepIdodTzc3s5sMWI;#XMMk zx1Hr|84864?N_p|haWBIrR}({4)imVP!8<6`-`Hn zH=)0Z__R3qa5t?fQKt`G8t7jCn?K(%xCc_gsZ&c-sRMapsF;tO*0LvWdRVj)$Gt{eTetDBseVq;+zxs|G8sCJ)b`*%!+b zPjQFUC4@_eE}_9nbyv!F2uMBuUT}(u!ghUz>%J(?bM$zX_8;A^pi3zuW@#aDa*lgD zkhLIZJLq+8M6edIBsoaDs=2jAy=SMZL2gkWL@DD4ZB_?s;E}I;>3Y-;9oXv_#l1&c zh?d|VH5BwfA6#gEU(4}eV#(%OD`kdA=ktZsNkO4(LMl%@@%bZ(N5S~2><`navOL&Vy_p1= zsC2fyUOv=?p9S|Tq4Q(fx*86gSH*Y<2y#Z+7~pd8tD6TI1Z?E(^AdRCnbU0`P*72n zfK-PU{1sIVGo$=P!NCTI)j*=BA%8f@EcIcGH^Qc>G8!mXX zDZ7+F9K$_31pUk~#gQwjl;BL=_tND1^FH;9_4&kmk`tvA8&d2G8d$HYndb~etGvHs z=k;{u^3WJ28;KEWn_LN}KLo7kOuBsHZ;=ZF^$q97Akn913ADyJILTrnL(k>Dr$F&} zdyZ(Ah7wUsI}_*hINq@!nnDwX+R^>%+4R&rfu7ycK&+v+;c{tD`2lMqp~sAX`g`Jg z!=X;BNy8J|H2$Jp0smvtNO@LSVokCl{2}=|8x{75*;siqVVEE1=LCikFxz-KiNy)M zob_7?cfB+OO&roWghM5gx|((!@;|H$`92D1pLH<|rKC3F`8X+1D*Ps51pa-sKX?SBNXtrYWv(r1SHFAo@caY#!xdSv9b-U5UH33hj!xic0)hO z2nX#Zx`hE5TEA8)me6?}44`hLv0gF@GPjIk|J{+nbJwjhu-9)V^8yl})!l?BPMsZq zUy0yT3;2ZI1o3Z({9Z#ozDSh7^(gvhLtE+>OuWp6y$)nJ_nquGZZdBIUhofBD#upF zAJZ=FeMz}CZ}RtYRtcv^Kq}3W)dHOoTRxvP=)}Bf`w%B!#uGhXKCLv{`eg&HM|4)y zDfeC*bX{xs`5l8BacE(6Ew)2Sr3@Z$hvl?4D!RAf)5geylqz^KsROJ+r|n#|9txy z+T>i13}$_`*Tsi(?#pekJ}H?Nma?MQ`0-{)ngMNTD+rmnKyuCq1N;Er8`ial5&T5s zPWM0J8iD`#6g(V>&RH%iYq9foI26Ax?+IAP?jWXEApZu7sy2N^v3-%dCu^eQYe{?s z-CB!(ZS)u5Vw%9uDl{cy>Do}m+DSy2x)DSsKJ-^{0HSa@kK+A%ztMav%)h$FQi+}T z%}BcXxuVOGV#0Yf_s4kwLFaB-+x!)+*sftczZJ#ooYWfCu5WTu%WgTT(}o{<^bQ;| zt_$Kz`%-0%s$QQik>G{#9zx@-a-QO!kPZ<5gp_2^82)YH%4zQo&QibYHD?YG4`Hev z7f?7XsA{ZRM>A7r7i4<^KA)3WUZHb1Rm(zZehRMS&ft=NYFw*4oQ1ZLDKk!-VDG~i z=EE&}2wXhz1m^IAbBO$B+w8^g?;;G%E?aZYT2oaun&lfmY^OAHvyBziN@kH^vF^)A zXrS)3d(=~jl{7t>kJUJ;)GaK6w}z*X7kK}$jdG=fvZ0D5Bh3ANga9 z+}93&I8S!RCkEzz#V5n~8<6?rBA&fJl@~|9IV0z_J}h#WnQgAkZH}M;WjzHOy}ZiJ zqkUK_mS;Msu1yMxq5K%@6S&yCU5~@g6za?@`zS|cP^9DAm9|l zaLmJDY)(%?*em{zr8a-yZEAp}ywQwxer4_L?b1%OD4O(i+e5;c4ZzPeS)| z;_=OZixlZAKiP2;2)r_+T@(Tji117g1**iYG==IUw_1wDLWWln9LFu1E7U9Pu5R*z zzQuf(w%jF=<_YFjy;c~E%M}#{&POms&{&py^|v|?k@~1SZ57WIm3Fq?sw$@cam^kJ z^qia1tEM%yadpO<-qxGb(>3sZv>R5t3zcmhkPQ>sg*yCD!P)!{WlBYN6m9c**TlaLjsfn#tgy{3hDIcGld3i%gCB5Lso%N>n2BOeXC{(tc+-r`J zE0xO$(2hJ zoO-T+Td}()1aR;d}wCUO=AeHJpKU4 zFIAPY;}-HSM+PiK(GUgW@3;P%icczRkw*n)Kl*!We{C%4KzImf{`JB4Ps}`o1yJIy z9D#lBZ|v}sFQ4dPaW;{#@>`#A69J%x0)3>>-`x8Py=uY&Fly@;Wi$Dxt6>V9vK+bwpU+ z`i>f`h?`1o>7T>UPmv5M0m{s$ZEN!Q)^}oHF+u65`d`}O;lHQ(I~U-$EFGc&1rAjw zobvl~f&71*l>hV7*KD(^|6#v%u%6u`E2VphhUOJbg9g@z<3dB@n$Ku$a&>dtB*{BD zB_=*m$>aV9d;B$XAAC3z9Z;*+%-GE3ZW3EWiOte56pF@t7B;7+0FsF`9vof}i0(V< zD!2ylcGg!q&@I&we&P4&T`c7KHO0fRh10Ni3G17%$|C0eT%vZr?8 zMC}<0@BcYC{$(JZV`_e}d$ip=GbiA6>bkXl>HDQ_ITMr;w7NE31$tmWd4hr`YLi$^ z-*|X&x_8!PyCil%I%D@}ey(GL4gW30Ewy}4SsuTlh&>y|&`-yWlX7NU#``i8fjM(h zbz-bOZo_$j?m)$YMu!mQnkX9D)S{19^_M5mxsC)gt&e~^W17&QcWT$Ezbz{W zO$Vd>5Dm|H*k}7oCib|n3*b%=z}(8v%1}(e&QeU$!P}kpzTB3JDpaG)8S 
z%NN7U_9T+8*sPV;lc58<&=&XbjQ1DCS`fi;?tik|NH7Q+EfZUO!n3Jf-7I8B;iEXi z+~p;2Bg)!yC(u2u^Lc+R1*Mzl_;~}57Tl84`2vV0VUKw?#CKgXfEO2RceQ+>UGSAe z`{KR^38fDS9v(?Bry7xFSecNAx_)7@7l*IE=pTA<2hNj>wE1aPYJ4|qK<{PQGxQJ6 zu;nO!q=kSC?g<)L(79FkxleyO*`5=rL0m#xWCLUeF6EpD;A3gFh?d(dC# zHzQdz;;OfcUd;s68FBN2n=hxzbsDoe>#Dp_I;YmQ(1M10!#IK$}yfeRjt67ma378 zg`#W_9(}Mt(P?JRmIB2&&{ltafP{K(SZe78UFPku)KT_mVOG%c9D}D_bU7ih+Quz&5sdd88oqrT}0f%`bmk6`!2!UXx{r(pT?&~|KdOTd+A`AT{EytcBs>_8>uyEmbk;sCINo@m&vmPSQX$O_V>- z9Vaeg2y5+$P@5tNcM6Rt7DZLA)DRGAC=PdPvTI{E=fReC!_#KI`cDf36XhpT3KiK{DP| z(q0%H7u@*{pUBG{@*yEP#3QJk4eN`h*@l`0pVxH#aTqG|tcIru*U@kbsw(POxG0}N zFlE{#sD?SNHqwWwL9%6qMTHRR6*StcQI%Ar50=@~_UFVaOCg)cRsS zs2IT(C!_iI%##tQ@bxf-1bQ=qRGTbAndbw}6}v@WYdbu^%t(VXpqarskw(#Hs`>$L}}L-IvZt`7*_mD>7zvD z!3;D^u@fip++0_8RYSdOGNyjE@{y0M<%N~A^QZ)Qy@be55fkia@t9Q#x`$gtiv%(-H5IH`o)#^=hP@-D}H z`2%dAKIN8y7k#1deX7pW_>0Q8Pz@+!`b0cLYl`~bJ?0}G)6tph3_LEvf&$ap*?h5m z#??<1eATJ_VMklRU5_?7K*l~i%I`sCqFXM!w6WXWsYa z^O_Hpt71IZLQP!~9J!+r&%jD$sOJMk3?A(06 z+SK-?#|7gxb>VS}#$j&Ks@m>&3e5`)Z0=ZKF5SW^cM-Ya9=pkXG9EUpbPXcbFJjw` zUaruq)Yk$n1SMYX%XdpDEQOSQ-y+n)@18qL09!^OHEB7w&17(7osx zg)R4VWYtS7cf$&9evPT?eK}K+xb?HSW3Cu=cDzsq=6@-0xcCbNH|xZm89$mmxQfON zxg=yc;Rwq$qeT_a?)BJ;=&+O3gAoVJ5sI%Q$8o%bDY{9j*_U}$V!&M^M>1PYbIy-h zdE_d%2`yZgGDCvb$-Y=Eo7Vz&E`N3Iu`n<<94e7fXj4 zT7E;8GPUFx!^;1F-hfbJ>*F-RKK#Ph@0M8;JXBVUFEWrDjVUZ@f=wU`A=h; zUIw37LkQgxN7G0MrEVoQ5f3C;@NSttV`4DvhwDY)R>0nJz=%V|9PKbIA+ zNV;IJVR?hlXhW6NSncWN76sd0ImKBw^{p=+lPYGdl;^;WKM~fU$L+plit?j`J!2uf z_HwhgErueU&y5y{p+nT<)-QfQzWyT1W`gpqPtj5WXKi$}+mXxvK92r>6=%?`E_4*v z@g#4puvr8eVwcWOB|CfI35u(qvfHw+)vPB@$pLXaD(6tCGbZsfL*m?KQ1%DS2 z^}{EKUn?01L3sxo>(BFraOBT0I>{1?H2AhN;U3IkONt}+^w#kJ86+9?N7R+e;6eXDL)`TvkNEulPiBBrpqmlkpkUQ`% zk&^M^K@hFo{k83_f-}1S_MtpOz+UD|OtXN^-U-I%sN;XLVd0U;-eToLI6!{a@zC;2Bt@u2& zQxV+3NPGL`bq=2-)KSC9Eld^eTZ}`Gr`mhf>EOb)6>yt3q(jzlmqIGFkWsI=xRHN( zuW@}K(Hk~Gy;y6v1(m)4&lwd90ei*8{`)mfpTiFu;iim0g3;}W#UK)VUh2+X_L^^w z%Psmy!$n^VKCwBoUuN&yhnU|cVx7-NBLeo-3;l^`v_F|6%;Y0y#_JKibsynhuL<~w z8uMOtv*I?FSs)|+X;?_pgn_#`7zO%H{|#i0(msW75>*C^|9G8 z_5Who52S7DH^~pKnW?G%<&71ALlZg$8ajTkx%Csv9k`^j2Z~aGx8`>T&K~gvSMohX zv)g3-sT#MX^E=z&Pip^P{HxC}?IeR}MsgYJ)5U+eo5sUG4exuh2St2Z9~9lX=VdLkyXPd=T;sB)Xn^Xgr+nTo+|fw51KD7 zC3}t|1hWJRMVsez^*!8w+{_)HA~<(mS+p)htx04J()*QdRApt_QAH&oB_^mRB?vvHUy7cYi(}s%WSic5x%FNcQ^wg~d1?FcL)X{{EqRy#;6pVBtLR+-3HHpL($&tn|i;Cc?vmWP;0PEpKla2i$O3iP1>P*6O?QNYcG)0$~He zDBOhp9@Nx-utb4^6Rcq7F5Qm4LEJw{p=KFtb@D7i>3@C$v54 zQdMRmxjIPzyky;>35FIr+QBZ~q0IW{k9r!ma2?k=A}5xh3_hPJ?;nY4)YoF56BCJSNtT4oAfb!>)ylSG>n}&L9DJ z!!1nTn^<^{d!8wSx=5^&Aq}O#jTH=z!Z4@I#j_WyObKqZ4a%F3tAvgk&oE5h^~2j$ zh_|wJC(YEi_&ioPB-T4=DchauBB|8BQ00el>hca7f-hQo+n3DDf)t+cqxUOJ7)IHA zg}RgWIc&?REmUyp`JRQlT}%$FG=$I6mErIw`a25ZMEsf>8+ZCi7u^`w!kS@B;*9!I zyz!FVt&Dt5P8a3IG}6O5wK*%>Dmh@g(7*pBvN0i+w5yg$T^BsmkuTodd#2RKr4{e8 zVwHBSrR)=q0SYzw&ak4q>yOok>1nunj!X}z1`-2u%XRhbI-NInWpkCGJkq+a zHNpDooFVEcYHt}$Fog|#w5+=H6N23t2V1v9BpvNNJo2K#Rgdx^bCY>82@e5VC_I5= zGDpSC3SD74z4^*U)o2Z!?~r;4*CFr3%;4&LZ$e+W+v9MYxCiyplp4vCe6-Acmp!z0 zX!RR7E;R5HKf2|tZ-kc6DoxBKdp4@=D~YGNl*;rG_!`Dkc4e9JD_e6Ja$zKhUnNNo z8_&_7XUjS}N4#v^YUmXhOCQBc)S!;~V!L^Pi?;f*W~-DEhZ3jA`>?w&)O(qcG1&Gf z>cH-cRT`4+R_!^3Y|O9+lmAS~ygSK4TrZ$KF%~tp$dFEHd z2KhGoPQ_RKnxHbr^<+Eo?v4t|6XgV%`VIs(Y5sL~W+Rd;=8L`@L1XeUnkyk~!k10r z6sY(5_w=y$5rp8XcC8+CcCI&Gh>`vD(F~cxB{)m(W_(9>F0l@nZ*KiZdlI@Oci(6A zPf2QAg)s4LBkb}SMfYtxd)b2r&qA$@fu?`OS4{{;PHh8i7&N?xPUe_u-n-RhsJ8K@ zJhLfj_@{n)aaCyoe?STni#`>a|TxrImU+IZCTmCXp4xvkhWd25fa7 z%y<|-o|`NveSB=@P_1>{z}}Te`$|X$zyXIf6~fIPda{*!xBO6`HprvDW-#$qcW~uGp 
zCRdX^Z1}dGHxF&q)p>Z{@5-=Ns`E7<;QQKOh%V)&_n7b_tx5G!+Tz!4X7fl59aL96_8PLpGC*xaewvWDwkk@%&pezK@-gYvnZPv(MH zZe~?MN--sSjxm=kD~q4#W2TV4Fs%2-yT5Y*m^N_XKT}{``W`!!aK|fjK{>vQJ`%S% zUgo~JC=3Zk5{?jp)v+Kt#0X#1CVSCN*>!NOPQBZFY`%$+?TK^0ayUCmp`BQ$F&Z8D z-4n;IP$BKywg;aJWBEQo$}`*6!@TZ=LursBA%W0DVX!=5iv~gjkD6=!6wCZq zgLcEdcLG4Z6c_wUtKMe^879aC3-NOgNAKB`KvN1N!|j+*O(27myB)9PbgRNjWs?$+ zU#-SCak&T;wmD#it_tJOo%+P@?dB<$dzTo=iuRy`r(z)^33L(ehaxr@YsW{LD?2Xp z_xM4dh~Z<&Sls85*BQQnL*9(fDVUCP9NM+2hwyl2xbDZlY9GvT9%)%BNQauT?lw=& zdlat>HN8^i8kpL4+rZ&mv83YErC&4IG!6;|?l#BUnE&D>A9fe+Tp|Qhr#;36r-K6v;He zhfqs9g386Vldt9p@W&WyG-NugDMA9jk`HKlh7k%Rq!FXXH$SH);m!4*4{@#SoA`g& zd&__*yRLr}5d}d)5D*wb5Ky`q8U>_5xjW2E%I2k-lT zp7(jr$Mf-g5@)Wt_TDRgEB4-NZOy~o(ViK6{$R~j*41%{o=KrMukyx3;G28$sjjue z=98vnvgACe-(n+zc(Nwbw6VV%)Y-F;@&oBHoa*fzXh_`(p)U2|3uo=wymGkzLG50z z{wL?&h|M?fdK`27c6>Agm&RQz-Ye;yaP(Snz0b`z3&QH8>E6y}W8o;h{+7nsM{*Q+D?%lF{KQgIiGNdxR5)t3_ROO`_u>l~D@7{UTK3kiK|aQC=$b!} z99T%r`K|k77SudZZSs1xj%)S3Wmq3DF-&xMU(!xZg>uX`sB-F-u^@V}Z0JaxxaS23 ze5$?G=}*jxGu(KS3j}1A{?ey?NU!|R?d(GSXSS75$7OBB9Ib!cVa;omD#;mGR)R;K zgkJt5WrPz!*r98&3RK=iTqJ^}_t~K#t+$qiDMg z-{Gs_&sVv7!El84s%N3DN(dF-wMt??cE++3??JD*Wa#CFj36t4Apx&z{HNbtPtA4` zT$N*QM#DS#G8$!d27x8{C-TVz`Brh*yROt&>?e_ZUx4n2^%e|d%6R?&ySf+mo zdNk1CZIJCcrRZE=iYt{h($=f&4)Ls-&T=OAdW#!Jy)TQ3--%6P^b>a)kT2fuF2y`) z8@T?sdgK%5C8APRv?ccze~mG?yI(PiwfNFpDTCzg+ExXU9we|w33*-u(%+2bUMmqdvtBC{N`Ly^?Xlf) z9b9maGS4PjI}HtjJm(60^2}N$UEA9GF09S>CklE%@t)YmL9;ZaB# zIp*X0GA(U|wW3>EgE(pGlnv`A;?2vJc8fL z7MKVQB<>Wj`(l}xz1=IzAu@wVg6bAj%e6zrY5iZq7xyv~7Tp=D~x>xa5xFCvzB@L z@q@h}`R>iPy=MOKR!(`PPH!YNmU@8D!25&sc}Cq5D^}8b+(HuOWGWlQl=!7--XGT3 z-X%2=!Na0CRyjT6eB4Lq3nDiPhvVBO^?0sRCd8o3M{L8};NVpu2Sk4sbbHU7wq-2v zVfntpxb52KwKn6l8r97`;cx4wLx;p({rb{cF@7LE@k~CoO(2;HaxjFbM_ScBlVB*Dl+OqTAG)ya3|bKTUXK-V zvPwKtyh)925Pg`c%ZNAQV)3nEmu>npWgd5wk!a53>wr2nSr+bcZ(w$}?eX@|>Q%_g zHG=HA=8pCC6CHMVi2?822Zig&K20l%uRQH(a+iJ=m+*f;IN59+y)$jgJkkr4c%$-~ zv-X~diHiT)cwob_jo)8C%gM@BEP94HWcCBgL!Z)9!Gv|2hrFZXZEf}IKnDZ;5cYgqy7!2;S5-#(vE_PBHK(vs z@7@QCdff}$L>t6t%}@&>O+<-*g2CKUie!oHP48l3JR6M&=bj(82;xGEsPGQqY6v;T zsQh=X{sHI919=10I+$Wf+P2jhet*pk6-~}C*+tBRcxgBW`kr0>uwWrzP1w2MO`24vPF7ZQ=a)*U*qMo=C{QA1tA+Xu7wIv zn1ynczRAOlbq{2@UO0EVGyv)uG>*4Deyq_(tX#ZnucB0%C^Bh-8^f{``ms%N;VbW+ zRN9X3zIfa5dr*=uu~YlvZZV~{{XWm|&`V|N#Eb`2sNsJ^x%AU~KhMJy$)h%1%aXll z2`08oFvCfD?P5AD$799Z%KLi)J+%7<8v0WrX0t^aY~~3B!xV9-N}hfw*fvABGWyf? z%;{vs@bMo+3TOFZy$QV469E3Oagut^-oE50%gP{7oompvJ26Ph!H0XzK-jdu6yOi zuRe8QRG;636~lMKd|Yd=imaC_py7y;)?Sy9|> zJBP?dm7pbA-AS7{sc_~W?{H!q6#aX8eS`fvUF8I^to6SUoC-u!hoMY#F=m;4V=6rhC1mtEE;WFQ+?XO zi4(B-q*b-LzeV?Glhk9M*Hl4dWOVAys?aGv4{xYMtW$(D?}6LCxt3wbb&P!cmy52W z?P2M-`+Q0gfvOX)VT@L5>cs2G!X(jl zCmP=1>M7G|$qBVH+5}jQLH=4oE!nvQyWdX8i@CEXa!KE6yRDJNWO^0?vfp+7(|W@! 
z6e4dId|(=n)0kQ0)L)BGpLs}$;%Zgkk{hRZ5XacdR^bB=VEuYcwO{BN3nwK%9rdyj zuRSXPgI3}OZo~)Z(D(95V{ZDM((XKZ9N9>KV+3`GiGfHL{%K~p7fGZ1vcGFv88ZC1 zYA;3=9Pjkkk~htq)k5?b7m}b5aQ3tJyXu){g`X4l7%F%yI)b7KMP%^r zv~g%`J#%ySITlX80guvYF>~*wx0W3BQDvS-59NN4ob6W?m(Np-&tUM$gNrpp%lv4X zYkg-0xlN=W1ah| z?bc*x*ZP!<=wqd-`>kbBEc!)L3}*y!PUM92k8e3<&0Tb`pb~5%_6_R)uoR}lRA}wv z_YC(wlG-;yBj?M~8}bj=w>LcUE{ZaET2mLlBky==Ai--bKkM3c<85!{;HG&TzgOX2 zan}U92QMpzNuu0QhhLxNfPDyyi-Sf}7g@dhi$4!DBOt;nw`<13I&fRXvj}h#S@&`s z*6vDd8X!=*K5=#9`*yv}1g?W{Uq5bW3aQYh>hon@!eq!Ls6qDLGGRv2tnQCdQ`;YC z=^+_nXS>^y*Gpnr1Rir)VJ|lygg+Hh6#A;E;2kfkOP-kai&9!3Zy&txI4yXjxh_o- z@b>@s@kr8TsGYrJLI?GO7UOd-%h4GD%dC#W<7Tc#@c_2`$8>?N$FmVj?Au*CRY#)R zq=@Q@^Wh*tnjtb{XasJCHl(|FN+t@38!^v?ysrC@Ruw1xy+bp7**!g!W1WY&`<1z} z&Je`2sGk=s;SYUfDDrG6-)T)Va@^LxLSx9pX4{pzP!<-4u+(b_c>P0<@klydm=b(zh^IhdM zR!O3ydVbp7dsN5>0ma;fg7ax#7n6}$;-OvEvhV3na<)xGQP%#nAE+O!w{IPa^{edl zSXDd>du8p{(|Q-N5Ff!C^a_)ZnL$ul8S_ za;_#i*R^f5#=u|A^*ws)Lubg%t;ij)T~jn4#BQB;L!B*7ktK?xKn+ zanxt(*QnwqRki8>hN+6IAf6EGrlWmGv6`T?6m4s2jB-tHFZS!4_QX@iFYoetclmEC zyd|-F5AJx?w{NBm@nTLKam*%)oO2gBup-`!MUD__n{kswn^Y8hk&tx0)xFLu?7?!7oF-dsE)2g8ASgM$`Ij}uIBgn!$P@`{v0iuk zAX#72n!+*G2Znnf{`aV}Dzl9P0q&7sR^q#w?SLmR-lL^ZJ=VB=e%Ps`in#w3qn1-hAp){-Av{i2q~!<}TmhcOP@p-}9qafe=Z=IbIgY zlwUU2O&5>(>B20&Pyep|DppKAh1_I%LFE?1m3c%d6SKuqvn@0-n!fn!2kY;OqcB%; z#s{4*Ly>`-EG-~P4SlyuRKTo~@14zR^OT^@Uyh2Ky~v1Pn8fyv_A1LQX-LHw^_1j9 z!N!VvE>i-$<`$EhhHyknY4_~gql6e2=f|(^e3;s%lj1^XxrmHqY`e^v%sDNc*Qw)d zV)Z#7(%2VDTI<~UjtOUG<3qe)&MuS@@82z-3?C|2)qM^wo97$xLanvQsob>f=-A78 zLu!!5M?#Qjv$Q&^XH7IwA;ajz>{>X>G9SLfo|Y_4O!rYW&76O`>-xG029Yj|BC;LP_E zM0a>n+zN?bcq*sjC8IqO9xQL^a{^WB#bOSoh0#|>#QRONRCv57g^wQD3j4WQpZNWi zo`&9UGsLLtS($xTbKmaBaS1#0GohPi-_iRawB40(gmhw@5_UZ>@#KB%cAGHt=>grex9%#TgZM80=JuSiOsA7-1{9w zN6oFvUZgitYYfA1B_m#rA_vx^t5`C7?Y=Sxe@kyTmbE zyYb8Y1Z)ITd0Fu3S-lCa#@33|WE@Z^W~!$~a)C=e8&3Rnd!E5(8eN%dO zd&nwihb0@0l#s0x&M4=E_)ln^4}B%VvSxR(vn-Uzj};Gcn&gY}j5-xUCuQM4~=klSp+JZ}wXT7^A+SNs3Q3ZWmwkzY#>ZWn57Gu_z-Kj=yIH!MJ;&!QJB&v0p-3>73kpz=iDq=P!q-{@e|9sZfE%hwuXH8U@7%|!g%KOPmFqm ztNXj--Le{vlLXBq-U^HgMU`>D;a$_#d)R6diP92Qf$3)iPR?h1Sm4rrFIWFir(TKhb8I6FYWW9qMWIoYyM+vNC>5TioCCgb%Jyo&-z+~XV>^pa$e+I5T~1pUj(lQ z9YNzmsmq_$8fbDeY;!Ss%?gelr3{Qyzgl~u#=nGN=uGNm+7~JxB=_30JQxb={}H0H zs%7J`f8vMb6?&B44_9-IN@i{ESz7(3ME?#SF2g^C2t##T5>bJ~NiCCop|s_Hewq6W z6^0D|Q)e^g&&hic*tD15#UU53r$D_e_ZL)NF@Y2^7c++A4GWH>6$4xn&ljW`Bwyu< z5-rFb36I~Q)Y~Mhr&KfKZ)ZOyRBAIUj}T;9il`q=P`_+HQr8X6%a&Xz|YJ=kLD8mbx89HdD$CD%kx+ zcow9|Dje~WYh5Lm#v?r0AYB5|OQ`b^TAA_It7kC#s_cXx%*Vg@gf%3;(GsYc_jbto zjS2rjsvRq#ijk}t!y;E){3q81l{<6t@=_`fHv?x}$i+re*R;oCnA0~G9AKnzp1bdO zlRM^at4xfVBU%;K5^oMVlG6fseygvy9lx@N>mV>;(Dp6a66WTFxBozuArw1FPB>U) zzh*!3a|p|RO+;<;Whr4+gdG?9-m}6bDG@xC0SyuO)iu|XT~0R25SIn7G2eLObEU7FF`4suffJ%Nvr?90gFG(g5kvC zL=Qc9e!XD*jnjA?8SPT258Q$pSBGENWp$wDJYz|E0U9v;tXDje%-LQ1yR))FROiVS z+wD2I<+28aK4Vet4ypY}2bZ7;I>*8^DLt$%b?=Svi?c-^B5{UKWwCwUC@|tg0tMP0 zr-Tjpd>o4Kr;b_Qy!sShq;*$8`d`)-hASwC5}c_-cwye%B45))kYOtj;#ZKrtHYDp z5AF+N^0wwFEZ*O_C4xMScJciCEUg2Dj0EpmHStnKtxkJLiv0(s26(*V$1T4<8}^sE zk}(Lua^7D-;*hlzHu7Z)0~rEzx)L`mCtPua3Eb`HX12o1=?hiN@Z&Xiuiv5T{Z+koU%VNfCqU5%bO416GH(3HZP>vWLuFnnlt&Up)V*&)uDqSjRykZ5*hO zb+}7Gxli$?X)Rqud&6HJSMl+$vUZ^_wUjcXk@D>+u zRs@K4pGIPYKIdl8N?<+%drd&%^u%16C||@VAkU`+=D*oskXu{zE?nxX9kgaB8>(Ij zGCD50N{d%qlcxe_#v(s|?t;lRaMT4J&cIO3x*|tERw~nt*otpIK-7HKBc%Q+oZ!*M zvcAWyPsp>a9c#IIS?N$EeLlT^V0m>BlOTSf=P|lumuru;*gEy^S%A%-xzWNp{OkGR z_~A%tBgplZBdbDVQDM0>osxPjK)Go#g2T!6b!ERT$wP|lLo$V1`p>d|PRdOQSX{mO z#=8a|b{pVYbuQe44{d99iSi5w#yfJ<^tpu;T>*Qb1nj|(@NEKg9?ZJY@N;cavOB6Z z4K~kUPw)D>=tJr#cShIfR)JRH9^l_n?TvTofTeZp^US9AN5_DZPNE>`TfLaufS7^J 
z^w<-#p4D>P#BYRe9C_oIC0=rkM<`kE$tcrfy%p}a&rvSZ@my9l#KMYLj&(OxA1XP0 z|6Q{RZ$^lpMX@A%z#xayZ{SXc$h_C2j39Z6c-wNg?i0gHg zs2Gc$0|CqzyXOH%fdtA7$=-`@s2jm_-}G7{ZdM50}(fy1P&P>gIgmOAYA#5 zb7nIMx`$DchkkYaaXdYl0p;1pl@a=xvcrgEr+aD}vECaprFSIt{0GayMjpKbEydS-TF>JHr{%$C^?XXq z3Gf}14_WRi*A7k!XRt4#F&L3`=LPFQWwHsNEaSMZ=xbqngNN&#q>s+)wFB>I+#JK* zxPQyU&^*sS@a9yor~}&QQPae6QdoT6pFTYYI5q8mAJ(;q7PTuCc!#t9`p-;Jd}Y?u zP8jL`JTeZdZoZ(pqKK9<7ts&+_Q|&XzmJCh|93?Go2dU^o`4pk!Mk7hJ@S8DYv#a6 z!Qo7?`>*r=-+zLS(KS{df(_pPoQ3G!Tcd_|AKo13{g2NdQQTTtJlIOW|G(wa{vOMY zrG3wzVETjPzZTX)^PV4~pQvK=pP%3VB8vBokl-K=b!*V4ndN^7&mbdeNV|0#T9Kgi z#_m}j;j2gg`AgB)2cXt(+wd#h{g&#e6dYrso&Pwq?uzz9#QHWBp1az7lV0nhE(|(9 z9k@;I^r-B$?Q>LC10>`=LuDnd59D9FR?MqL?pYQai0n=})xUE71OJBo{(JmqQi%yc z8l3fjmcvzV-NKxf*gGBp4_xb8kVSqm1od|qw&M2bbC>MF;r5-KtCLi_FS1y5d=)S=vJUBsh!6bVCip}+HCksNi7e((+KSNY|s*u6CZ&{VA6 zvn^Ww=TK%;1AWq6QF3>-NoGY981#>GKu9y_^?*~<{p{f9Jvnz~oQl;0cAb!WIgG9- zOLqBJm%HCR?|LYZ3%k-v-uz5n)UU;Wbqbs~ll?hiVs4JIbU;Fga|=TpZ6$Btt!mhE zKT~XTBFWqRO0vSR)lh`7$|Sd6FJuav6$QzIw_2#wIP|O*0;aIV;y-BmodlJCmh6r` znaIaw#m`V3ucE_zzwk11-k)YN^*Lw1H zZzW%LtTP#W*LqGkHID3$A2WCu^^e4>hVL!eVFbZBqLJl*n4Vn~J_F@~-e|L9oAnU* zF3S=LT8Dc62_K%5xzu2iK?5~4>%k2e`_7J5Qyq*&cPY_UWUE)r;@b?vBG;5^{;PHh z9%pTfzpNKLIz8$fmcEmq`lCdPP2WPpNBd&Pm!V^sHq7^kAB-Vpdn=JuBpS5EUhKXj z^%eA%Y!AS5+Gu|5A`HGgCRsFoh4uwBsZz~U*2H*7u+gw;he9*#|E3yIbUVioYSRaOR5Ya8S0;R}Y1t>(ZjrkHr+Ra-Re`OP+-b@Jx^t|c6 z_33yJ3)QqjkF3IqLc%6(WsyPC<>>3q8VGSHxuvRx@1sWW}Z`(ij$3BI6z{NkWh z!qJPAKmWX*^WWEV0=_T?-cCh6PYB+k*^55W4DORbs~NJtgTi4_WE)U0J@Vr3nBS6V zaXMlyVZArn@);CRB3f#Oxq^Rjiu<1({se7A><;8pwoyQnoKsR=p+9gim&*0FK%|<) zD*}=Tlj+_Cb)6Qe@)KX+#)H)xPQ1gF30W;ckE%9$@U3mp)pM^c07;v8S@1XuDdY?C zaklM5W!xLNy8S4OXXCCQDR#;i5(j8@&^l-ano4}bFJ2@y1vwHZpCYMQ=;|+I0!lym z>RvWH1%1)irlR}ILBN*|-|gIUkkhYNdxm&_?_}N{`u>6C?YE72 z8-IB_?b-;17`TNG&Lk2%?)L$*X;-4A*jxr`Q1!0;(84&01!GcZ?6LWOS+#p~s;kP`d5A+{(!3W>st34=J-mN1jUgw(cF%#c@KVt3WYvTNd~h# zE7M0;WdLN$+et|42MR~PN_B%5$V!3WL?Sw}(a?7?lr~E05A!-W=xo$BUw6kDk5Q~1 z^`vSnXw^9YU2Dera>F^$xQXq~rGIt~p50<}u$ z!L!5nsY1cT**^H(#<>3h%{4*yOu;>~K zyZ537|0=SVO+0N6H&TM2+00A~I33FltS#4ZeSiNFpsx!t)>Z zFiw!|#TqB<_IrFm?*lc#s`ov9fJ6yc&hIGziAsi_*&}6D!qLu0l=WW{HCV^4V*NuR zKBi|#rNNna%x}yGF)CD9Lk07}`uP*>gExSs$pLrrSGnL#Io}-oYYU)|5Jdf>P3^7b zM>{M)eAR{S3;?NMi^np5JZSun2x*7Jo5RxpSeHWjEirn4RQnNg1F2>o)dQBd(5*XB z3{VE=k%CxC6QJ)|)mwCB4gf>B8y}5|sBk)p>s^%qzdq~%ep$4s>>&00xYLdTwc-DY zP$evyaq=${vP%4+5L?%TBI#isvI-p+{mfV7=v} z+4bP=CEv~6OK^1S9~o*=uX*Kn_pi12iU}ldQ1&*^{6~|3Zy(g4ZHWMn76o5ai>0_A znNTF-t4;;jlYcmMr2ZI*DmP3p+yzWf_fnZg>T$osCv=RHpnpfG;;DJ<_Rk1@GGK&! z`g_3MST4`90(xGmB}bZ>f+*V3CI-a%ZUqqQ zS7Y*q>qzU7{6Ls$kn>+z8i-p-Ya{t<)t)lo`z|vBLW$M>0ff3V=!mp?3`QXS@!1^X z1PK7EOc7L_X3S9;A$25SMjJI*` zAIn@wFAtkCy8l<_AY=Y5hiZ?qefh!4jljyO^58F`k!_v@89iU5AYt|_gW6Z;KImIt z=%I0hbQT00*+X6e2c`}0>;}}IEu$hAj)GN;w}JleAe79oP33h=q|8zK? 
GIT binary patch data (base85-encoded payload omitted)