I suspect the cause of this error is either the Triton server startup command (tritonserver --allow-sagemaker=true --allow-http=false $SAGEMAKER_ARGS) or the SageMaker Endpoint invocation (runtime_sm_client.invoke_endpoint(EndpointName=endpoint_name, ContentType=f"application/vnd.sagemaker-triton.binary+json;json-header-size={header_length}", Body=request_body)); details and code are attached below. When I run the same Triton inference inside the AWS SageMaker Training job (on the same instance used for training), it works as expected. Any help with this issue would be highly appreciated.
import argparse
import json
import logging
import os
import sys
import tempfile
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import glob
import cudf
import numpy as np
import pandas as pd
import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags
from transformers4rec.utils.data_utils import save_time_based_splits
import torch
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory
from merlin.schema import Schema
from merlin.io import Dataset
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
from merlin.core.dispatch import make_df
from merlin.systems.dag import Ensemble
from merlin.systems.dag.ops.pytorch import PredictPyTorch
from merlin.systems.dag.ops.workflow import TransformWorkflow
import cloudpickle
from merlin.table import TensorTable, TorchColumn
from merlin.table.conversions import convert_col
import shutil
from nvtabular.workflow import Workflow
# Module logger: emits DEBUG-and-above records through a stream handler bound to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
def parse_args():
    """
    Parse arguments passed from the SageMaker API to the container.

    Only ``--model_dir`` is declared. Its default is taken from the
    ``SM_MODEL_DIR`` environment variable; when that variable is not set the
    SageMaker default ``/opt/ml/model`` is used, so ``model_dir`` is never
    ``None``.

    Returns:
        tuple: ``(namespace, unknown_args)`` as produced by
        ``ArgumentParser.parse_known_args`` -- arguments that are not declared
        here are returned in ``unknown_args`` instead of raising an error.
    """
    parser = argparse.ArgumentParser()
    # Model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument(
        "--model_dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )
    return parser.parse_known_args()
def data_preprocessing():
    """
    Build a synthetic interaction dataset, run the NVTabular ETL and write per-day splits.

    Environment variables read:
        INPUT_DATA_DIR: base data directory (default ``"./data/"``).
        NUM_ROWS: number of synthetic interaction rows (default ``100000``).
        OUTPUT_DIR: destination of the time-based splits
            (default ``<INPUT_DATA_DIR>/sessions_by_day``).

    Files written under INPUT_DATA_DIR: the transformed parquet data in
    ``processed_nvt`` and the fitted workflow in ``workflow_etl``.
    """
    input_dir = os.environ.get("INPUT_DATA_DIR", "./data/")
    n_rows = int(os.environ.get("NUM_ROWS", 100000))

    # Item ids drawn from a clipped log-normal, giving a long-tailed distribution.
    item_ids = np.clip(
        np.random.lognormal(3., 1., n_rows).astype(np.int32), 1, 50000
    )

    # Random item interaction features.
    df = pd.DataFrame(
        np.random.randint(70000, 90000, n_rows), columns=['session_id']
    )
    df['item_id'] = item_ids
    # Each item id is mapped to one of 334 category buckets.
    df['category'] = pd.cut(
        df['item_id'], bins=334, labels=np.arange(1, 335)
    ).astype(np.int32)
    df['age_days'] = np.random.uniform(0, 1, n_rows).astype(np.float32)
    df['weekday_sin'] = np.random.uniform(0, 1, n_rows).astype(np.float32)

    # Every session gets one random day in [1, 10).
    unique_sessions = df['session_id'].unique()
    session_days = np.random.randint(1, 10, size=df['session_id'].nunique())
    df['day'] = df['session_id'].map(dict(zip(unique_sessions, session_days)))

    max_session_len = 20
    min_session_len = 2

    # Categorify the categorical columns, then group all interactions by session.
    categorified = ['item_id', 'category'] >> nvt.ops.Categorify()
    to_group = categorified + ['session_id', 'day', 'age_days', 'weekday_sin']
    grouped = to_group >> nvt.ops.Groupby(
        groupby_cols=["session_id"],
        aggs={
            "item_id": ["list", "count"],
            "category": ["list"],
            "day": ["first"],
            "age_days": ["list"],
            'weekday_sin': ["list"],
        },
        name_sep="-",
    )

    # Keep only the last `max_session_len` entries of each sequential feature.
    category_seq = grouped['category-list'] >> nvt.ops.ListSlice(-max_session_len)
    item_seq = (
        grouped['item_id-list']
        >> nvt.ops.ListSlice(-max_session_len)
        >> nvt.ops.TagAsItemID()
    )
    continuous_seq = (
        grouped['age_days-list', 'weekday_sin-list']
        >> nvt.ops.ListSlice(-max_session_len)
        >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])
    )

    selected = (
        grouped['item_id-count', 'day-first', 'session_id']
        + item_seq
        + category_seq
        + continuous_seq
    )
    # Sessions with a single interaction are dropped (not valid for
    # next-item prediction training and evaluation).
    kept_sessions = selected >> nvt.ops.Filter(
        f=lambda frame: frame["item_id-count"] >= min_session_len
    )
    list_columns = (
        kept_sessions['item_id-list', 'category-list', 'age_days-list', 'weekday_sin-list']
        >> nvt.ops.ValueCount()
    )

    workflow = nvt.Workflow(kept_sessions['session_id', 'day-first'] + list_columns)
    raw_dataset = nvt.Dataset(df)

    # Fitting gathers the feature statistics; exporting the parquet files
    # also generates the schema file.
    processed_dir = os.path.join(input_dir, "processed_nvt")
    workflow.fit_transform(raw_dataset).to_parquet(processed_dir)
    workflow.save(os.path.join(input_dir, "workflow_etl"))

    splits_dir = os.environ.get(
        "OUTPUT_DIR", os.path.join(input_dir, "sessions_by_day")
    )
    # Read the processed parquet file back and split it by day.
    sessions_gdf = cudf.read_parquet(
        os.path.join(input_dir, "processed_nvt/part_0.parquet")
    )
    save_time_based_splits(
        data=nvt.Dataset(sessions_gdf),
        output_dir=splits_dir,
        partition_col='day-first',
        timestamp_col='session_id',
    )
def model_training():
    """
    Train the XLNet next-item-prediction model incrementally, one day window at a time, then save it.

    For every window ``t`` in ``range(start_window_index, final_window_index)``
    the model is trained on ``<OUTPUT_DIR>/<t>/train.parquet`` and evaluated on
    ``<OUTPUT_DIR>/<t + 1>/valid.parquet``. After the loop the last evaluation
    window is evaluated once more and the model is saved.

    Environment variables read:
        INPUT_DATA_DIR: base data directory (default ``"./data"``).
        OUTPUT_DIR: directory holding the per-day splits
            (default ``<INPUT_DATA_DIR>/sessions_by_day``).
        per_device_train_batch_size: default ``128``.
        per_device_eval_batch_size: default ``32``.
        start_window_index / final_window_index: half-open window range,
            defaults ``1`` and ``8``.
        model_path: directory the trained model is saved to
            (default ``<INPUT_DATA_DIR>/saved_model``). This is the same
            variable and default that ``model_ensemble`` loads the model from.

    Raises:
        ValueError: if the window range is empty, since there would be nothing
            to train on and no evaluation window for the final evaluation.
    """
    INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "./data")
    OUTPUT_DIR = os.environ.get("OUTPUT_DIR", f"{INPUT_DATA_DIR}/sessions_by_day")
    max_sequence_length = 20

    per_device_train_batch_size = int(os.environ.get("per_device_train_batch_size", '128'))
    per_device_eval_batch_size = int(os.environ.get("per_device_eval_batch_size", '32'))
    start_window_index = int(os.environ.get("start_window_index", '1'))
    final_window_index = int(os.environ.get("final_window_index", '8'))
    if start_window_index >= final_window_index:
        # Fail before building the model instead of hitting an unbound
        # `time_index_eval` after the (empty) training loop.
        raise ValueError(
            "start_window_index (%d) must be smaller than final_window_index (%d)"
            % (start_window_index, final_window_index)
        )

    train = Dataset(os.path.join(INPUT_DATA_DIR, "processed_nvt/part_0.parquet"))
    # You can select a subset of features for training
    schema = train.schema.select_by_name(
        ['item_id-list', 'category-list', 'weekday_sin-list', 'age_days-list']
    )

    inputs = tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=max_sequence_length,
        continuous_projection=64,
        masking="mlm",
        d_output=100,
    )

    # Define XLNetConfig class and set default parameters for HF XLNet config
    transformer_config = tr.XLNetConfig.build(
        d_model=64, n_head=4, n_layer=2, total_seq_length=max_sequence_length
    )
    # Define the model block including: inputs, masking, projection and transformer block.
    body = tr.SequentialBlock(
        inputs,
        tr.MLPBlock([64]),
        tr.TransformerBlock(transformer_config, masking=inputs.masking),
    )

    # Define the evaluation top-N metrics and the cut-offs
    metrics = [
        NDCGAt(top_ks=[20, 40], labels_onehot=True),
        RecallAt(top_ks=[20, 40], labels_onehot=True),
    ]

    # Define a head related to next item prediction task
    head = tr.Head(
        body,
        tr.NextItemPredictionTask(weight_tying=True, metrics=metrics),
        inputs=inputs,
    )

    # Get the end-to-end Model class
    model = tr.Model(head)

    # Set hyperparameters for training
    train_args = T4RecTrainingArguments(
        data_loader_engine='merlin',
        dataloader_drop_last=True,
        gradient_accumulation_steps=1,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        output_dir="./tmp",
        learning_rate=0.0005,
        lr_scheduler_type='cosine',
        learning_rate_num_cosine_cycles_by_epoch=1.5,
        num_train_epochs=5,
        max_sequence_length=max_sequence_length,
        report_to=[],
        logging_steps=50,
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=train_args,
        schema=schema,
        compute_metrics=True,
    )

    # Iterate over the day windows: train on day t, evaluate on day t + 1.
    for time_index in range(start_window_index, final_window_index):
        time_index_train = time_index
        time_index_eval = time_index + 1
        train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet"))
        eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))
        print(train_paths)

        # Train on day related to time_index
        print('*'*20)
        print("Launch training for day %s are:" % time_index)
        print('*'*20 + '\n')
        trainer.train_dataset_or_path = train_paths
        trainer.reset_lr_scheduler()
        trainer.train()
        trainer.state.global_step += 1
        print('finished')

        # Evaluate on the following day
        trainer.eval_dataset_or_path = eval_paths
        train_metrics = trainer.evaluate(metric_key_prefix='eval')
        print('*'*20)
        print("Eval results for day %s are:\t" % time_index_eval)
        print('\n' + '*'*20 + '\n')
        for key in sorted(train_metrics.keys()):
            print(" %s = %s" % (key, str(train_metrics[key])))
        wipe_memory()

    # Evaluate once more on the last evaluation window of the loop.
    eval_data_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))
    eval_metrics = trainer.evaluate(eval_dataset=eval_data_paths, metric_key_prefix='eval')
    for key in sorted(eval_metrics.keys()):
        print(" %s = %s" % (key, str(eval_metrics[key])))

    # Save where model_ensemble() looks for the model. Reading the OUTPUT_DIR
    # variable here would redirect the model into the splits directory whenever
    # OUTPUT_DIR is set, and the export step would then not find it.
    model_path = os.environ.get("model_path", f"{INPUT_DATA_DIR}/saved_model")
    model.save(model_path)
def model_ensemble(output_path):
    """
    Trace the trained model and export it, chained after the ETL workflow, as a Merlin Systems ensemble.

    The pickled model is loaded from ``<model_path>/t4rec_model_class.pkl``,
    moved to the GPU, put in eval mode and traced with ``torch.jit.trace`` on a
    sample of rows from ``<OUTPUT_DIR>/1/train.parquet``. The saved NVTabular
    workflow (``<INPUT_DATA_DIR>/workflow_etl``) is chained in front of the
    traced model and the resulting ensemble is exported.

    Environment variables read:
        INPUT_DATA_DIR: base data directory (default ``"./data/"``).
        OUTPUT_DIR: directory holding the per-day splits
            (default ``<INPUT_DATA_DIR>/sessions_by_day``).
        model_path: directory containing the pickled model
            (default ``<INPUT_DATA_DIR>/saved_model``).

    Args:
        output_path: directory passed to ``Ensemble.export``.
    """
    INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "./data/")
    OUTPUT_DIR = os.environ.get("OUTPUT_DIR", f"{INPUT_DATA_DIR}/sessions_by_day")
    model_path = os.environ.get("model_path", f"{INPUT_DATA_DIR}/saved_model")

    # NOTE: unpickling executes arbitrary code -- only load model files that
    # were produced by a trusted training run.
    # The context manager closes the file handle once the model is loaded.
    with open(os.path.join(model_path, "t4rec_model_class.pkl"), "rb") as model_file:
        loaded_model = cloudpickle.load(model_file)
    model = loaded_model.cuda()
    model.eval()

    # Build example inputs for tracing from the first day's training split,
    # restricted to the columns the model expects.
    train_paths = os.path.join(OUTPUT_DIR, "1/train.parquet")
    df = cudf.read_parquet(train_paths, columns=model.input_schema.column_names)
    table = TensorTable.from_df(df.loc[:100])
    for column in table.columns:
        table[column] = convert_col(table[column], TorchColumn)
    model_input_dict = table.to_dict()

    traced_model = torch.jit.trace(model, model_input_dict, strict=True)
    input_schema = model.input_schema
    output_schema = model.output_schema

    # Raw columns -> fitted ETL workflow -> traced PyTorch model.
    workflow = Workflow.load(os.path.join(INPUT_DATA_DIR, "workflow_etl"))
    torch_op = (
        workflow.input_schema.column_names
        >> TransformWorkflow(workflow)
        >> PredictPyTorch(traced_model, input_schema, output_schema)
    )

    ensemble = Ensemble(torch_op, workflow.input_schema)
    # The configs returned by export() are not needed here.
    ensemble.export(output_path)
def train(output_path):
    """
    Run the whole pipeline: preprocessing, training, then ensemble export.

    Args:
        output_path: directory handed to ``model_ensemble`` for the export.
    """
    # The first two stages take no arguments and run strictly in this order.
    for stage in (data_preprocessing, model_training):
        stage()
    model_ensemble(output_path)
if __name__ == "__main__":
    # parse_args() returns (namespace, unknown_args); only the namespace is used.
    args = parse_args()[0]
    train(output_path=args.model_dir)
INFO:sagemaker:Creating training-job with name: model-training-2024-12-10-16-51-51-539
2024-12-10 16:51:54 Starting - Starting the training job...
2024-12-10 16:52:08 Starting - Preparing the instances for training...
2024-12-10 16:52:49 Downloading - Downloading the training image..................
2024-12-10 16:55:56 Training - Training image download completed. Training in progress....==================================
== Triton Inference Server Base ==
==================================
NVIDIA Release 23.06 (build 62878575)
Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved.
This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
2024-12-10 16:56:10,623 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2024-12-10 16:56:10,658 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2024-12-10 16:56:10,693 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2024-12-10 16:56:10,707 sagemaker-training-toolkit INFO Invoking user script
Training Env:
{
"additional_framework_parameters": {},
"channel_input_dirs": {},
"current_host": "algo-1",
"current_instance_group": "homogeneousCluster",
"current_instance_group_hosts": [
"algo-1"
],
"current_instance_type": "ml.g4dn.xlarge",
"distribution_hosts": [],
"distribution_instance_groups": [],
"framework_module": null,
"hosts": [
"algo-1"
],
"hyperparameters": {},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {},
"input_dir": "/opt/ml/input",
"instance_groups": [
"homogeneousCluster"
],
"instance_groups_dict": {
"homogeneousCluster": {
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.xlarge",
"hosts": [
"algo-1"
]
}
},
"is_hetero": false,
"is_master": true,
"is_modelparallel_enabled": null,
"is_smddpmprun_installed": false,
"is_smddprun_installed": false,
"job_name": "model-training-2024-12-10-16-51-51-539",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://sagemaker-us-east-1-397669588823/model-training-2024-12-10-16-51-51-539/source/sourcedir.tar.gz",
"module_name": "train",
"network_interface_name": "eth0",
"num_cpus": 4,
"num_gpus": 1,
"num_neurons": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"current_instance_type": "ml.g4dn.xlarge",
"current_group_name": "homogeneousCluster",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.xlarge",
"hosts": [
"algo-1"
]
}
],
"network_interface_name": "eth0"
},
"user_entry_point": "train.py"
}
Environment variables:
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS={}
SM_USER_ENTRY_POINT=train.py
SM_FRAMEWORK_PARAMS={}
SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}],"network_interface_name":"eth0"}
SM_INPUT_DATA_CONFIG={}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=[]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.g4dn.xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT={"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}}
SM_DISTRIBUTION_INSTANCE_GROUPS=[]
SM_IS_HETERO=false
SM_MODULE_NAME=train
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=4
SM_NUM_GPUS=1
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://sagemaker-us-east-1-397669588823/model-training-2024-12-10-16-51-51-539/source/sourcedir.tar.gz
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","current_instance_group":"homogeneousCluster","current_instance_group_hosts":["algo-1"],"current_instance_type":"ml.g4dn.xlarge","distribution_hosts":[],"distribution_instance_groups":[],"framework_module":null,"hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","instance_groups":["homogeneousCluster"],"instance_groups_dict":{"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}},"is_hetero":false,"is_master":true,"is_modelparallel_enabled":null,"is_smddpmprun_installed":false,"is_smddprun_installed":false,"job_name":"model-training-2024-12-10-16-51-51-539","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-397669588823/model-training-2024-12-10-16-51-51-539/source/sourcedir.tar.gz","module_name":"train","network_interface_name":"eth0","num_cpus":4,"num_gpus":1,"num_neurons":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"train.py"}
SM_USER_ARGS=[]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
PYTHONPATH=/opt/ml/code:/usr/local/bin:/opt/tritonserver:/usr/local/lib/python3.10/dist-packages:/usr/lib/python310.zip:/usr/lib/python3.10:/usr/lib/python3.10/lib-dynload:/usr/local/lib/python3.10/dist-packages/faiss-1.7.2-py3.10.egg:/ptx:/usr/local/lib/python3.10/dist-packages/merlin_hps-0.0.0-py3.10-linux-x86_64.egg:/usr/lib/python3/dist-packages:/usr/lib/python3.10/dist-packages
Invoking script with the following command:
/usr/bin/python3 train.py
2024-12-10 16:56:10,708 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker Debugger as it is not installed.
2024-12-10 16:56:10,708 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker TF as Tensorflow is not installed.
/usr/local/lib/python3.10/dist-packages/merlin/dtypes/mappings/tf.py:52: UserWarning: Tensorflow dtype mappings did not load successfully due to an error: No module named 'tensorflow'
warn(f"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}")
#015Creating time-based splits: 0%| | 0/9 [00:00<?, ?it/s]#015Creating time-based splits: 11%|█ | 1/9 [00:00<00:07, 1.11it/s]#015Creating time-based splits: 56%|█████▌ | 5/9 [00:01<00:00, 6.19it/s]#015Creating time-based splits: 100%|██████████| 9/9 [00:01<00:00, 11.05it/s]#015Creating time-based splits: 100%|██████████| 9/9 [00:01<00:00, 7.77it/s]
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
['./data/sessions_by_day/1/train.parquet']
********************
Launch training for day 1 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:01<01:53, 1.78s/it]#015 3%|▎ | 2/65 [00:01<00:51, 1.23it/s]#015 6%|▌ | 4/65 [00:02<00:21, 2.88it/s]#015 9%|▉ | 6/65 [00:02<00:12, 4.59it/s]#015 12%|█▏ | 8/65 [00:02<00:09, 6.29it/s]#015 15%|█▌ | 10/65 [00:02<00:06, 7.86it/s]#015 18%|█▊ | 12/65 [00:02<00:05, 9.19it/s]#015 22%|██▏ | 14/65 [00:02<00:05, 9.63it/s]#015 25%|██▍ | 16/65 [00:02<00:04, 10.71it/s]#015 28%|██▊ | 18/65 [00:03<00:04, 11.48it/s]#015 31%|███ | 20/65 [00:03<00:03, 12.16it/s]#015 34%|███▍ | 22/65 [00:03<00:03, 12.70it/s]#015 37%|███▋ | 24/65 [00:03<00:03, 13.03it/s]#015 40%|████ | 26/65 [00:03<00:02, 13.20it/s]#015 43%|████▎ | 28/65 [00:03<00:03, 12.32it/s]#015 46%|████▌ | 30/65 [00:04<00:02, 12.75it/s]#015 49%|████▉ | 32/65 [00:04<00:02, 13.03it/s]#015 52%|█████▏ | 34/65 [00:04<00:02, 13.26it/s]#015 55%|█████▌ | 36/65 [00:04<00:02, 13.46it/s]#015 58%|█████▊ | 38/65 [00:04<00:01, 13.55it/s]#015 62%|██████▏ | 40/65 [00:04<00:02, 12.46it/s]#015 65%|██████▍ | 42/65 [00:04<00:01, 12.93it/s]#015 68%|██████▊ | 44/65 [00:05<00:01, 13.19it/s]#015 71%|███████ | 46/65 [00:05<00:01, 13.43it/s]#015 74%|███████▍ | 48/65 [00:05<00:01, 13.48it/s]#015 77%|███████▋ | 50/65 [00:05<00:01, 13.66it/s]#015 #015#015 77%|███████▋ | 50/65 [00:05<00:01, 13.66it/s]#015 80%|████████ | 52/65 [00:05<00:00, 13.54it/s]#015 83%|████████▎ | 54/65 [00:05<00:00, 12.33it/s]#015 86%|████████▌ | 56/65 [00:05<00:00, 12.79it/s]#015 89%|████████▉ | 58/65 [00:06<00:00, 13.13it/s]#015 92%|█████████▏| 60/65 [00:06<00:00, 13.26it/s]#015 95%|█████████▌| 62/65 [00:06<00:00, 13.36it/s]#015 98%|█████████▊| 64/65 [00:06<00:00, 13.53it/s]#015 #015#015100%|██████████| 65/65 [00:06<00:00, 13.53it/s]#015100%|██████████| 65/65 [00:06<00:00, 9.73it/s]
{'loss': 5.8526, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 6.6806, 'train_samples_per_second': 1245.397, 'train_steps_per_second': 9.73, 'train_loss': 5.716501206618089, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 81.45it/s]
********************
Eval results for day 2 are:#011
********************
eval_/loss = 5.159573078155518
eval_/next-item/ndcg_at_20 = 0.151389941573143
eval_/next-item/ndcg_at_40 = 0.20212794840335846
eval_/next-item/recall_at_20 = 0.4114583432674408
eval_/next-item/recall_at_40 = 0.65625
eval_runtime = 0.1406
eval_samples_per_second = 1365.82
eval_steps_per_second = 42.682
['./data/sessions_by_day/2/train.parquet']
********************
Launch training for day 2 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:00<00:07, 8.46it/s]#015 5%|▍ | 3/65 [00:00<00:05, 11.95it/s]#015 8%|▊ | 5/65 [00:00<00:04, 13.05it/s]#015 11%|█ | 7/65 [00:00<00:04, 13.28it/s]#015 14%|█▍ | 9/65 [00:00<00:04, 13.38it/s]#015 17%|█▋ | 11/65 [00:00<00:03, 13.59it/s]#015 20%|██ | 13/65 [00:00<00:03, 13.72it/s]#015 23%|██▎ | 15/65 [00:01<00:03, 12.57it/s]#015 26%|██▌ | 17/65 [00:01<00:03, 12.87it/s]#015 29%|██▉ | 19/65 [00:01<00:03, 13.18it/s]#015 32%|███▏ | 21/65 [00:01<00:03, 13.38it/s]#015 35%|███▌ | 23/65 [00:01<00:03, 13.57it/s]#015 38%|███▊ | 25/65 [00:01<00:02, 13.74it/s]#015 42%|████▏ | 27/65 [00:02<00:03, 12.62it/s]#015 45%|████▍ | 29/65 [00:02<00:02, 12.92it/s]#015 48%|████▊ | 31/65 [00:02<00:02, 13.14it/s]#015 51%|█████ | 33/65 [00:02<00:02, 13.42it/s]#015 54%|█████▍ | 35/65 [00:02<00:02, 13.43it/s]#015 57%|█████▋ | 37/65 [00:02<00:02, 13.57it/s]#015 60%|██████ | 39/65 [00:02<00:01, 13.57it/s]#015 63%|██████▎ | 41/65 [00:03<00:01, 12.52it/s]#015 66%|██████▌ | 43/65 [00:03<00:01, 12.88it/s]#015 69%|██████▉ | 45/65 [00:03<00:01, 13.17it/s]#015 72%|███████▏ | 47/65 [00:03<00:01, 13.26it/s]#015 75%|███████▌ | 49/65 [00:03<00:01, 13.44it/s]#015 #015#015 77%|███████▋ | 50/65 [00:03<00:01, 13.44it/s]#015 78%|███████▊ | 51/65 [00:03<00:01, 13.54it/s]#015 82%|████████▏ | 53/65 [00:04<00:00, 12.41it/s]#015 85%|████████▍ | 55/65 [00:04<00:00, 12.80it/s]#015 88%|████████▊ | 57/65 [00:04<00:00, 12.91it/s]#015 91%|█████████ | 59/65 [00:04<00:00, 13.21it/s]#015 94%|█████████▍| 61/65 [00:04<00:00, 13.23it/s]#015 97%|█████████▋| 63/65 [00:04<00:00, 13.45it/s]#015100%|██████████| 65/65 [00:04<00:00, 13.69it/s]#015 #015#015100%|██████████| 65/65 [00:04<00:00, 13.69it/s]#015100%|██████████| 65/65 [00:04<00:00, 13.18it/s]
{'loss': 4.9397, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 4.932, 'train_samples_per_second': 1686.954, 'train_steps_per_second': 13.179, 'train_loss': 4.896147273137019, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 80.61it/s]
********************
Eval results for day 3 are:#011
********************
eval_/loss = 4.623995304107666
eval_/next-item/ndcg_at_20 = 0.1915176659822464
eval_/next-item/ndcg_at_40 = 0.2323640137910843
eval_/next-item/recall_at_20 = 0.5
eval_/next-item/recall_at_40 = 0.6979166865348816
eval_runtime = 0.1367
eval_samples_per_second = 1404.154
eval_steps_per_second = 43.88
['./data/sessions_by_day/3/train.parquet']
********************
Launch training for day 3 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:00<00:07, 8.63it/s]#015 5%|▍ | 3/65 [00:00<00:05, 12.12it/s]#015 8%|▊ | 5/65 [00:00<00:04, 13.07it/s]#015 11%|█ | 7/65 [00:00<00:04, 13.19it/s]#015 14%|█▍ | 9/65 [00:00<00:04, 13.32it/s]#015 17%|█▋ | 11/65 [00:00<00:03, 13.54it/s]#015 20%|██ | 13/65 [00:00<00:03, 13.67it/s]#015 23%|██▎ | 15/65 [00:01<00:03, 12.57it/s]#015 26%|██▌ | 17/65 [00:01<00:03, 12.99it/s]#015 29%|██▉ | 19/65 [00:01<00:03, 13.34it/s]#015 32%|███▏ | 21/65 [00:01<00:03, 13.57it/s]#015 35%|███▌ | 23/65 [00:01<00:03, 13.64it/s]#015 38%|███▊ | 25/65 [00:01<00:02, 13.74it/s]#015 42%|████▏ | 27/65 [00:02<00:03, 12.49it/s]#015 45%|████▍ | 29/65 [00:02<00:02, 12.83it/s]#015 48%|████▊ | 31/65 [00:02<00:02, 13.05it/s]#015 51%|█████ | 33/65 [00:02<00:02, 13.33it/s]#015 54%|█████▍ | 35/65 [00:02<00:02, 13.52it/s]#015 57%|█████▋ | 37/65 [00:02<00:02, 13.50it/s]#015 60%|██████ | 39/65 [00:02<00:01, 13.56it/s]#015 63%|██████▎ | 41/65 [00:03<00:01, 12.42it/s]#015 66%|██████▌ | 43/65 [00:03<00:01, 12.79it/s]#015 69%|██████▉ | 45/65 [00:03<00:01, 12.98it/s]#015 72%|███████▏ | 47/65 [00:03<00:01, 13.11it/s]#015 75%|███████▌ | 49/65 [00:03<00:01, 13.19it/s]#015 #015#015 77%|███████▋ | 50/65 [00:03<00:01, 13.19it/s]#015 78%|███████▊ | 51/65 [00:03<00:01, 13.28it/s]#015 82%|████████▏ | 53/65 [00:04<00:01, 11.52it/s]#015 85%|████████▍ | 55/65 [00:04<00:00, 11.63it/s]#015 88%|████████▊ | 57/65 [00:04<00:00, 12.03it/s]#015 91%|█████████ | 59/65 [00:04<00:00, 12.49it/s]#015 94%|█████████▍| 61/65 [00:04<00:00, 12.77it/s]#015 97%|█████████▋| 63/65 [00:04<00:00, 13.09it/s]#015100%|██████████| 65/65 [00:05<00:00, 13.23it/s]#015 #015#015100%|██████████| 65/65 [00:05<00:00, 13.23it/s]#015100%|██████████| 65/65 [00:05<00:00, 12.95it/s]
{'loss': 4.6249, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 5.0177, 'train_samples_per_second': 1658.117, 'train_steps_per_second': 12.954, 'train_loss': 4.608376018817609, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 65.46it/s]
********************
Eval results for day 4 are:#011
********************
eval_/loss = 4.37877082824707
eval_/next-item/ndcg_at_20 = 0.21264421939849854
eval_/next-item/ndcg_at_40 = 0.26149502396583557
eval_/next-item/recall_at_20 = 0.5520833134651184
eval_/next-item/recall_at_40 = 0.7916666865348816
eval_runtime = 0.1551
eval_samples_per_second = 1238.234
eval_steps_per_second = 38.695
['./data/sessions_by_day/4/train.parquet']
********************
Launch training for day 4 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:00<00:09, 6.78it/s]#015 3%|▎ | 2/65 [00:00<00:07, 8.19it/s]#015 5%|▍ | 3/65 [00:00<00:07, 8.77it/s]#015 6%|▌ | 4/65 [00:00<00:06, 9.10it/s]#015 8%|▊ | 5/65 [00:00<00:06, 9.26it/s]#015 9%|▉ | 6/65 [00:00<00:06, 9.38it/s]#015 11%|█ | 7/65 [00:00<00:06, 9.43it/s]#015 12%|█▏ | 8/65 [00:00<00:06, 9.45it/s]#015 14%|█▍ | 9/65 [00:00<00:05, 9.54it/s]#015 15%|█▌ | 10/65 [00:01<00:05, 9.56it/s]#015 17%|█▋ | 11/65 [00:01<00:05, 9.58it/s]#015 18%|█▊ | 12/65 [00:01<00:05, 9.62it/s]#015 20%|██ | 13/65 [00:01<00:05, 9.62it/s]#015 22%|██▏ | 14/65 [00:01<00:06, 8.48it/s]#015 23%|██▎ | 15/65 [00:01<00:05, 8.81it/s]#015 26%|██▌ | 17/65 [00:01<00:04, 10.19it/s]#015 29%|██▉ | 19/65 [00:01<00:04, 11.27it/s]#015 32%|███▏ | 21/65 [00:02<00:03, 11.98it/s]#015 35%|███▌ | 23/65 [00:02<00:03, 12.46it/s]#015 38%|███▊ | 25/65 [00:02<00:03, 12.65it/s]#015 42%|████▏ | 27/65 [00:02<00:03, 11.79it/s]#015 45%|████▍ | 29/65 [00:02<00:02, 12.27it/s]#015 48%|████▊ | 31/65 [00:02<00:02, 12.59it/s]#015 51%|█████ | 33/65 [00:03<00:02, 12.78it/s]#015 54%|█████▍ | 35/65 [00:03<00:02, 12.96it/s]#015 57%|█████▋ | 37/65 [00:03<00:02, 13.05it/s]#015 60%|██████ | 39/65 [00:03<00:01, 13.11it/s]#015 63%|██████▎ | 41/65 [00:03<00:01, 12.02it/s]#015 66%|██████▌ | 43/65 [00:03<00:01, 12.44it/s]#015 69%|██████▉ | 45/65 [00:04<00:01, 12.49it/s]#015 72%|███████▏ | 47/65 [00:04<00:01, 12.88it/s]#015 75%|███████▌ | 49/65 [00:04<00:01, 13.10it/s]#015 #015#015 77%|███████▋ | 50/65 [00:04<00:01, 13.10it/s]#015 78%|███████▊ | 51/65 [00:04<00:01, 13.18it/s]#015 82%|████████▏ | 53/65 [00:04<00:00, 12.10it/s]#015 85%|████████▍ | 55/65 [00:04<00:00, 12.51it/s]#015 88%|████████▊ | 57/65 [00:04<00:00, 12.83it/s]#015 91%|█████████ | 59/65 [00:05<00:00, 12.98it/s]#015 94%|█████████▍| 61/65 [00:05<00:00, 13.17it/s]#015 97%|█████████▋| 63/65 [00:05<00:00, 13.29it/s]#015100%|██████████| 65/65 [00:05<00:00, 13.41it/s]#015 #015#015100%|██████████| 65/65 [00:05<00:00, 
13.41it/s]#015100%|██████████| 65/65 [00:05<00:00, 11.76it/s]
{'loss': 4.5203, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 5.5296, 'train_samples_per_second': 1504.618, 'train_steps_per_second': 11.755, 'train_loss': 4.5138815072866585, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 80.70it/s]
********************
Eval results for day 5 are:#011
********************
eval_/loss = 4.3964667320251465
eval_/next-item/ndcg_at_20 = 0.18582205474376678
eval_/next-item/ndcg_at_40 = 0.24064064025878906
eval_/next-item/recall_at_20 = 0.5104166865348816
eval_/next-item/recall_at_40 = 0.78125
eval_runtime = 0.1351
eval_samples_per_second = 1421.411
eval_steps_per_second = 44.419
['./data/sessions_by_day/5/train.parquet']
********************
Launch training for day 5 are:
********************
#015 0%| | 0/60 [00:00<?, ?it/s]#015 2%|▏ | 1/60 [00:00<00:07, 7.42it/s]#015 5%|▌ | 3/60 [00:00<00:05, 11.33it/s]#015 8%|▊ | 5/60 [00:00<00:04, 12.46it/s]#015 12%|█▏ | 7/60 [00:00<00:04, 12.99it/s]#015 15%|█▌ | 9/60 [00:00<00:03, 13.34it/s]#015 18%|█▊ | 11/60 [00:00<00:03, 13.43it/s]#015 22%|██▏ | 13/60 [00:01<00:03, 12.15it/s]#015 25%|██▌ | 15/60 [00:01<00:03, 12.56it/s]#015 28%|██▊ | 17/60 [00:01<00:03, 12.92it/s]#015 32%|███▏ | 19/60 [00:01<00:03, 13.29it/s]#015 35%|███▌ | 21/60 [00:01<00:02, 13.54it/s]#015 38%|███▊ | 23/60 [00:01<00:02, 13.71it/s]#015 42%|████▏ | 25/60 [00:01<00:02, 12.52it/s]#015 45%|████▌ | 27/60 [00:02<00:02, 12.78it/s]#015 48%|████▊ | 29/60 [00:02<00:02, 13.12it/s]#015 52%|█████▏ | 31/60 [00:02<00:02, 13.23it/s]#015 55%|█████▌ | 33/60 [00:02<00:02, 13.33it/s]#015 58%|█████▊ | 35/60 [00:02<00:01, 13.40it/s]#015 62%|██████▏ | 37/60 [00:02<00:01, 11.61it/s]#015 65%|██████▌ | 39/60 [00:03<00:01, 12.02it/s]#015 68%|██████▊ | 41/60 [00:03<00:01, 12.44it/s]#015 72%|███████▏ | 43/60 [00:03<00:01, 12.68it/s]#015 75%|███████▌ | 45/60 [00:03<00:01, 12.88it/s]#015 78%|███████▊ | 47/60 [00:03<00:00, 13.09it/s]#015 82%|████████▏ | 49/60 [00:03<00:00, 12.05it/s]#015 #015#015 83%|████████▎ | 50/60 [00:03<00:00, 12.05it/s]#015 85%|████████▌ | 51/60 [00:04<00:00, 12.41it/s]#015 88%|████████▊ | 53/60 [00:04<00:00, 12.63it/s]#015 92%|█████████▏| 55/60 [00:04<00:00, 12.93it/s]#015 95%|█████████▌| 57/60 [00:04<00:00, 13.02it/s]#015 98%|█████████▊| 59/60 [00:04<00:00, 13.15it/s]#015 #015#015100%|██████████| 60/60 [00:04<00:00, 13.15it/s]#015100%|██████████| 60/60 [00:04<00:00, 12.79it/s]
{'loss': 4.4897, 'learning_rate': 0.00024999999999999995, 'epoch': 4.17}
{'train_runtime': 4.6906, 'train_samples_per_second': 1637.311, 'train_steps_per_second': 12.791, 'train_loss': 4.493310101826986, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 75.56it/s]
********************
Eval results for day 6 are:#011
********************
eval_/loss = 4.464842319488525
eval_/next-item/ndcg_at_20 = 0.182639017701149
eval_/next-item/ndcg_at_40 = 0.23443275690078735
eval_/next-item/recall_at_20 = 0.5
eval_/next-item/recall_at_40 = 0.75
eval_runtime = 0.1432
eval_samples_per_second = 1341.187
eval_steps_per_second = 41.912
['./data/sessions_by_day/6/train.parquet']
********************
Launch training for day 6 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:00<00:07, 8.29it/s]#015 5%|▍ | 3/65 [00:00<00:05, 11.42it/s]#015 8%|▊ | 5/65 [00:00<00:05, 11.30it/s]#015 11%|█ | 7/65 [00:00<00:05, 11.23it/s]#015 14%|█▍ | 9/65 [00:00<00:05, 11.14it/s]#015 17%|█▋ | 11/65 [00:00<00:04, 11.23it/s]#015 20%|██ | 13/65 [00:01<00:04, 11.20it/s]#015 23%|██▎ | 15/65 [00:01<00:04, 10.49it/s]#015 26%|██▌ | 17/65 [00:01<00:04, 10.92it/s]#015 29%|██▉ | 19/65 [00:01<00:04, 10.68it/s]#015 32%|███▏ | 21/65 [00:01<00:04, 10.58it/s]#015 35%|███▌ | 23/65 [00:02<00:04, 10.26it/s]#015 38%|███▊ | 25/65 [00:02<00:03, 10.12it/s]#015 42%|████▏ | 27/65 [00:02<00:04, 9.11it/s]#015 43%|████▎ | 28/65 [00:02<00:04, 9.14it/s]#015 45%|████▍ | 29/65 [00:02<00:03, 9.27it/s]#015 46%|████▌ | 30/65 [00:02<00:03, 9.27it/s]#015 49%|████▉ | 32/65 [00:03<00:03, 10.02it/s]#015 52%|█████▏ | 34/65 [00:03<00:02, 10.98it/s]#015 55%|█████▌ | 36/65 [00:03<00:02, 11.78it/s]#015 58%|█████▊ | 38/65 [00:03<00:02, 12.34it/s]#015 62%|██████▏ | 40/65 [00:03<00:02, 11.57it/s]#015 65%|██████▍ | 42/65 [00:03<00:01, 12.28it/s]#015 68%|██████▊ | 44/65 [00:04<00:01, 12.69it/s]#015 71%|███████ | 46/65 [00:04<00:01, 12.83it/s]#015 74%|███████▍ | 48/65 [00:04<00:01, 13.03it/s]#015 77%|███████▋ | 50/65 [00:04<00:01, 13.25it/s]#015 #015#015 77%|███████▋ | 50/65 [00:04<00:01, 13.25it/s]#015 80%|████████ | 52/65 [00:04<00:00, 13.31it/s]#015 83%|████████▎ | 54/65 [00:04<00:00, 12.21it/s]#015 86%|████████▌ | 56/65 [00:04<00:00, 12.52it/s]#015 89%|████████▉ | 58/65 [00:05<00:00, 12.78it/s]#015 92%|█████████▏| 60/65 [00:05<00:00, 12.95it/s]#015 95%|█████████▌| 62/65 [00:05<00:00, 13.18it/s]#015 98%|█████████▊| 64/65 [00:05<00:00, 13.03it/s]#015 #015#015100%|██████████| 65/65 [00:05<00:00, 13.03it/s]#015100%|██████████| 65/65 [00:05<00:00, 11.49it/s]
{'loss': 4.4849, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 5.6554, 'train_samples_per_second': 1471.167, 'train_steps_per_second': 11.493, 'train_loss': 4.474948002741887, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 75.96it/s]
********************
Eval results for day 7 are:#011
********************
eval_/loss = 4.43782901763916
eval_/next-item/ndcg_at_20 = 0.2118179351091385
eval_/next-item/ndcg_at_40 = 0.25489482283592224
eval_/next-item/recall_at_20 = 0.5364583134651184
eval_/next-item/recall_at_40 = 0.7447916865348816
eval_runtime = 0.1424
eval_samples_per_second = 1348.357
eval_steps_per_second = 42.136
['./data/sessions_by_day/7/train.parquet']
********************
Launch training for day 7 are:
********************
#015 0%| | 0/65 [00:00<?, ?it/s]#015 2%|▏ | 1/65 [00:00<00:07, 8.45it/s]#015 5%|▍ | 3/65 [00:00<00:05, 11.55it/s]#015 8%|▊ | 5/65 [00:00<00:04, 12.14it/s]#015 11%|█ | 7/65 [00:00<00:04, 12.45it/s]#015 14%|█▍ | 9/65 [00:00<00:04, 12.64it/s]#015 17%|█▋ | 11/65 [00:00<00:04, 12.93it/s]#015 20%|██ | 13/65 [00:01<00:03, 13.11it/s]#015 23%|██▎ | 15/65 [00:01<00:04, 12.07it/s]#015 26%|██▌ | 17/65 [00:01<00:03, 12.48it/s]#015 29%|██▉ | 19/65 [00:01<00:03, 12.80it/s]#015 32%|███▏ | 21/65 [00:01<00:03, 12.85it/s]#015 35%|███▌ | 23/65 [00:01<00:03, 13.10it/s]#015 38%|███▊ | 25/65 [00:01<00:03, 12.94it/s]#015 42%|████▏ | 27/65 [00:02<00:03, 11.95it/s]#015 45%|████▍ | 29/65 [00:02<00:02, 12.41it/s]#015 48%|████▊ | 31/65 [00:02<00:02, 12.76it/s]#015 51%|█████ | 33/65 [00:02<00:02, 12.83it/s]#015 54%|█████▍ | 35/65 [00:02<00:02, 13.01it/s]#015 57%|█████▋ | 37/65 [00:02<00:02, 13.02it/s]#015 60%|██████ | 39/65 [00:03<00:02, 12.91it/s]#015 63%|██████▎ | 41/65 [00:03<00:02, 11.93it/s]#015 66%|██████▌ | 43/65 [00:03<00:01, 12.38it/s]#015 69%|██████▉ | 45/65 [00:03<00:01, 12.74it/s]#015 72%|███████▏ | 47/65 [00:03<00:01, 13.04it/s]#015 75%|███████▌ | 49/65 [00:03<00:01, 13.28it/s]#015 #015#015 77%|███████▋ | 50/65 [00:03<00:01, 13.28it/s]#015 78%|███████▊ | 51/65 [00:04<00:01, 13.35it/s]#015 82%|████████▏ | 53/65 [00:04<00:00, 12.21it/s]#015 85%|████████▍ | 55/65 [00:04<00:00, 12.45it/s]#015 88%|████████▊ | 57/65 [00:04<00:00, 12.75it/s]#015 91%|█████████ | 59/65 [00:04<00:00, 12.89it/s]#015 94%|█████████▍| 61/65 [00:04<00:00, 13.13it/s]#015 97%|█████████▋| 63/65 [00:04<00:00, 13.11it/s]#015100%|██████████| 65/65 [00:05<00:00, 13.15it/s]#015 #015#015100%|██████████| 65/65 [00:05<00:00, 13.15it/s]#015100%|██████████| 65/65 [00:05<00:00, 12.72it/s]
{'loss': 4.5053, 'learning_rate': 0.0002801341700638303, 'epoch': 3.85}
{'train_runtime': 5.1121, 'train_samples_per_second': 1627.513, 'train_steps_per_second': 12.715, 'train_loss': 4.501318594125601, 'epoch': 5.0}
finished
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 71.06it/s]
********************
Eval results for day 8 are:#011
********************
eval_/loss = 4.415571689605713
eval_/next-item/ndcg_at_20 = 0.17643620073795319
eval_/next-item/ndcg_at_40 = 0.2237410992383957
eval_/next-item/recall_at_20 = 0.5260416865348816
eval_/next-item/recall_at_40 = 0.7552083134651184
eval_runtime = 0.149
eval_samples_per_second = 1288.315
eval_steps_per_second = 40.26
#015 0%| | 0/6 [00:00<?, ?it/s]#015100%|██████████| 6/6 [00:00<00:00, 69.36it/s]
eval_/loss = 4.415571689605713
eval_/next-item/ndcg_at_20 = 0.17643620073795319
eval_/next-item/ndcg_at_40 = 0.2237410992383957
eval_/next-item/recall_at_20 = 0.5260416865348816
eval_/next-item/recall_at_40 = 0.7552083134651184
eval_runtime = 0.1498
eval_samples_per_second = 1281.623
eval_steps_per_second = 40.051
2024-12-10 16:57:15,910 sagemaker-training-toolkit INFO Reporting training SUCCESS
2024-12-10 16:57:22 Uploading - Uploading generated training model
2024-12-10 16:57:35 Completed - Training job completed
Training seconds: 300
Billable seconds: 300
❓ Questions & Help
Hi everyone, I was trying to reproduce the "Getting Started: Session-based Recommendation with Synthetic Data" example on AWS SageMaker, following the official "Training and Serving Merlin on AWS SageMaker" tutorial (which uses a
`merlin-models` model), but using a `transformers4rec` model instead. The AWS SageMaker tutorial using
`merlin-models` works as expected for both the training and inference steps (after applying the fixes from PR NVIDIA-Merlin/Merlin#1040). However, when I try to do the same with the `transformers4rec` getting-started tutorial, I get the following error when performing inference on a SageMaker Endpoint:
| 1733851742784 | I1210 17:29:02.641670 103 python_be.cc:2177] TRITONBACKEND_ModelInstanceExecute: model instance name 0_transformworkflowtriton_0 released 1 requests | AllTraffic/i-0a730c865fae02cab |
| 1733851742784 | Failed to transform operator <merlin.systems.dag.runtimes.triton.ops.workflow.TransformWorkflowTriton object at 0x7f70e322ce50> | AllTraffic/i-0a730c865fae02cab |
| 1733851742784 | Traceback (most recent call last): File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 237, in _run_node_transform transformed_data = node.op.transform(selection, input_data) File "/usr/local/lib/python3.10/dist-packages/merlin/systems/dag/runtimes/triton/ops/workflow.py", line 92, in transform raise RuntimeError(inference_response.error().message()) | AllTraffic/i-0a730c865fae02cab |
| 1733851742784 | RuntimeError: Error: <class 'KeyError'> - "['weekday_sin-list', 'category-list', 'item_id-count', 'age_days-list', 'item_id-list', 'day-first'] not in index", Traceback: [' File "/opt/ml/model/0_transformworkflowtriton/1/model.py", line 117, in execute\n transformed = self.runner.run_workflow(input_tensors)\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/systems/workflow/base.py", line 103, in run_workflow\n transformed = LocalExecutor().transform(transformable, self.workflow.graph)\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 102, in transform\n transformed_data = self._execute_node(node, transformable, capture_dtypes, strict)\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 116, in _execute_node\n upstream_outputs = self._run_upstream_transforms(\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 130, in _run_upstream_transforms\n node_output = self._execute_node(\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 119, in _execute_node\n upstream_columns = self._append_addl_root_columns(node, transformable, upstream_outputs)\n', ' File "/usr/local/lib/python3.10/dist-packages/merlin/dag/executors.py", line 154, in _append_addl_root_columns\n upstream_outputs.append(transformable[list(root_columns)])\n', ' File "/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py", line 3811, in __getitem__\n indexer = self.columns._get_indexer_strict(key, "columns")[1]\n', ' File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 6113, in _get_indexer_strict\n self._raise_if_missing(keyarr, indexer, axis_name)\n', ' File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 6176, in _raise_if_missing\n raise KeyError(f"{not_found} not in index")\n']
As you can see, the error seems to be related to the grouped variables in the
`0_transformworkflowtriton` model of the Triton ensemble. However, the model training and the ensemble initialization on the Triton server seem to be OK (see SM_endpoint_logs_full.txt).

I think that the cause of this error could be in the Triton server initialization command (
`tritonserver --allow-sagemaker=true --allow-http=false $SAGEMAKER_ARGS`) or in the SageMaker Endpoint invocation (`runtime_sm_client.invoke_endpoint(EndpointName=endpoint_name, ContentType=f"application/vnd.sagemaker-triton.binary+json;json-header-size={header_length}", Body=request_body)`) (details and code attached below), since when I perform the Triton inference using the AWS SageMaker Training job (the same instance used for training) it works as expected. Any help with this issue will be highly appreciated.

Details
Following the Merlin SageMaker tutorial, these are my files:
`Dockerfile`
`serve` (initializes the Triton server; copied from the PR fix):
`train.py` (here I just copied the `transformers4rec` tutorial):