diff --git a/.gitignore b/.gitignore index e75b6c06..6a055629 100644 --- a/.gitignore +++ b/.gitignore @@ -144,6 +144,7 @@ local.py local_code/ local_code.py debug.py +*.local* # MacOs files .DS_Store diff --git a/_tsml_research_resources/soton/config.example b/_tsml_research_resources/soton/config.example new file mode 100644 index 00000000..27ebc883 --- /dev/null +++ b/_tsml_research_resources/soton/config.example @@ -0,0 +1,10 @@ +username="arb1g19" +gpu_job="false" +iridis_version="5" +env_name="regression_experiments" +max_folds=1 +relative_script_file_path="tsml-eval/tsml_eval/experiments/forecasting_experiments.py" +relative_data_dir="Data/forecasting" +relative_dataset_list="Data/windowed_series.txt" +relative_results_dir="RegressionResults/results/" +relative_out_dir="RegressionResults/output/" diff --git a/_tsml_research_resources/soton/iridis/batch_scripts/taskfarm_regression_experiments.sh b/_tsml_research_resources/soton/iridis/batch_scripts/taskfarm_regression_experiments.sh deleted file mode 100644 index ec378afc..00000000 --- a/_tsml_research_resources/soton/iridis/batch_scripts/taskfarm_regression_experiments.sh +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/bash -# Check and edit all options before the first run! -# While reading is fine, please dont write anything to the default directories in this script - -# Start and end for resamples -max_folds=10 -start_fold=1 - -# To avoid hitting the cluster queue limit we have a higher level queue -max_num_submitted=900 - -# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx -queue="batch" - -# The number of tasks to submit in each job. This can be larger than the number of cores, but tasks will be delayed until a core is free -n_tasks_per_node=40 - -# The number of cores to request from the node. Don't go over the number of cores for the node. 40 is the number of cores on batch nodes -# If you are not using the whole node, please make sure you are requesting memory correctly -max_cpus_to_use=40 - -# Create a separate submission list for each regressor. This will stop the mixing of -# large and small jobs in the same node, but results in some smaller scripts submitted -# to serial when moving between regressors. -# For small workloads i.e. single resample 10 datasets, turning this off will be the only way to get on the batch queue realistically -split_regressors="true" - -# Enter your username and email here -username="ajb2u23" -mail="NONE" -mailto=$username"@soton.ac.uk" - -# Max allowable is 60 hours -max_time="60:00:00" - -# Start point for the script i.e. 3 datasets, 3 regressors = 9 experiments to submit, start_point=5 will skip to job 5 -start_point=1 - -# Put your home directory here -local_path="/mainfs/home/$username/" - -# Datasets to use and directory of data files. Dataset list can either be a text file or directory of text files -# Separate text files will not run jobs of the same dataset in the same node. This is good to keep large and small datasets separate -data_dir="$local_path/Data/" -dataset_list="$local_path/DataSetLists/RegressionBatch/" - -# Results and output file write location. 
Change these to reflect your own file structure -results_dir="$local_path/RegressionResults/results/" -out_dir="$local_path/RegressionResults/output/" - -# The python script we are running -script_file_path="$local_path/tsml-eval/tsml_eval/experiments/regression_experiments.py" - -# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md -# Separate environments for GPU and CPU are recommended -env_name="eval-py11" - -# Regressors to loop over. Must be separated by a space. Different regressors will not run in the same node by default -# See list of potential regressors in set_regressor -regressors_to_run="ROCKET DrCIF" - -# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args -# You will have to add any variable to the python call close to the bottom of the script -# and possibly to the options handling below - -# generate a results file for the train data as well as test, usually slower -generate_train_files="false" - -# If set for true, looks for _TRAIN.ts file. This is useful for running tsml-java resamples -predefined_folds="false" - -# Normalise data before fit/predict -normalise_data="false" - -# ====================================================================================== -# Experiment configuration end -# ====================================================================================== - -# Set to -tr to generate test files -generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") - -# Set to -pr to use predefined folds -predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") - -# Set to -rn to normalise data -normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") - -# This creates the submission file to run and does clean up -submit_jobs () { - -if ((cmdCount>=max_cpus_to_use)); then - cpuCount=$max_cpus_to_use -else - cpuCount=$cmdCount -fi - -echo "#!/bin/bash -#SBATCH --mail-type=${mail} -#SBATCH --mail-user=${mailto} -#SBATCH --job-name=batch-${dt} -#SBATCH -p ${queue} -#SBATCH -t ${max_time} -#SBATCH -o ${outDir}/%A-${dt}.out -#SBATCH -e ${outDir}/%A-${dt}.err -#SBATCH --nodes=1 -#SBATCH --ntasks=${cpuCount} - -. /etc/profile - -module load anaconda/py3.10 -source activate $env_name - -staskfarm ${outDir}/generatedCommandList-${dt}.txt" > generatedSubmissionFile-${dt}.sub - -echo "At experiment ${expCount}, ${totalCount} jobs submitted total" - -sbatch < generatedSubmissionFile-${dt}.sub - -rm generatedSubmissionFile-${dt}.sub - -} - -totalCount=0 -expCount=0 -dt=$(date +%Y%m%d%H%M%S) - -# turn a directory of files into a list -if [[ -d $dataset_list ]]; then - file_names="" - for file in ${dataset_list}/*; do - file_names="$file_names$dataset_list$(basename "$file") " - done - dataset_list=$file_names -fi - -for dataset_file in $dataset_list; do - -echo "Dataset list ${dataset_file}" - -for regressor in $regressors_to_run; do - -mkdir -p "${out_dir}/${regressor}/" - -if [ "${split_regressors,,}" == "true" ]; then - # we use time for unique names - sleep 1 - cmdCount=0 - dt=$(date +%Y%m%d%H%M%S) - outDir=${out_dir}/${regressor} -else - outDir=${out_dir} -fi - -while read dataset; do - -# Skip to the script start point -((expCount++)) -if ((expCount>=start_point)); then - -# This finds the resamples to run and skips jobs which have test/train files already written to the results directory. 
-# This can result in uneven sized command lists -resamples_to_run="" -for (( i=start_fold-1; i=n_tasks_per_node)); then - submit_jobs - - # This is the loop to stop you from dumping everything in the queue at once, see max_num_submitted - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l) - while [ "${num_jobs}" -ge "${max_num_submitted}" ] - do - echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted} - sleep 60 - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l) - done - - sleep 1 - cmdCount=0 - dt=$(date +%Y%m%d%H%M%S) -fi - -# Input args to the default regression_experiments are in main method of -# https://github.com/time-series-machine-learning/tsml-eval/blob/main/tsml_eval/experiments/regression_experiments.py -echo "python -u ${script_file_path} ${data_dir} ${results_dir} ${regressor} ${dataset} ${resample} ${generate_train_files} ${predefined_folds} ${normalise_data} > ${out_dir}/${regressor}/output-${dataset}-${resample}-${dt}.txt 2>&1" >> ${outDir}/generatedCommandList-${dt}.txt - -((cmdCount++)) -((totalCount++)) - -done -fi -done < ${dataset_file} - -if [[ "${split_regressors,,}" == "true" && $cmdCount -gt 0 ]]; then - # final submit for this regressor - submit_jobs -fi - -done - -if [[ "${split_regressors,,}" != "true" && $cmdCount -gt 0 ]]; then - # final submit for this dataset list - submit_jobs -fi - -done - -echo Finished submitting jobs diff --git a/_tsml_research_resources/soton/iridis/gpu_scipts/gpu_regression_experiments.sh b/_tsml_research_resources/soton/iridis/gpu_scipts/gpu_regression_experiments.sh deleted file mode 100644 index e9fd2441..00000000 --- a/_tsml_research_resources/soton/iridis/gpu_scipts/gpu_regression_experiments.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/bash -# Check and edit all options before the first run! -# While reading is fine, please dont write anything to the default directories in this script - -# Start and end for resamples -max_folds=5 -start_fold=1 - -# To avoid hitting the cluster queue limit we have a higher level queue -max_num_submitted=12 - -# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx -queue="gpu" - -# Enter your username and email here -username="ajb2u23" -mail="NONE" -mailto="$username@soton.ac.uk" - -# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue -max_memory=8000 - -# Max allowable is 60 hours -max_time="60:00:00" - -# Start point for the script i.e. 3 datasets, 3 regressors = 9 jobs to submit, start_point=5 will skip to job 5 -start_point=1 - -# Put your home directory here -local_path="/mainfs/home/$username/" - -# Datasets to use and directory of data files. Default is Tony's work space, all should be able to read these. Change if you want to use different data or lists -data_dir="$local_path/Data/" -datasets="$local_path/DataSetLists/Regression.txt" - -# Results and output file write location. Change these to reflect your own file structure -results_dir="$local_path/RegressionResults/results/" -out_dir="$local_path/RegressionResults/output/" - -# The python script we are running -script_file_path="$local_path/tsml-eval/tsml_eval/experiments/regression_experiments.py" - -# the path to the apptainer sandbox. 
The above script or most other files do not need to be in the sandbox -container_path="scratch/tensorflow_sandbox/" - -# Regressors to loop over. Must be separated by a space -# See list of potential regressors in set_regressor -regressors_to_run="RocketRegressor TimeSeriesForestRegressor" - -# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args -# You will have to add any variable to the python call close to the bottom of the script -# and possibly to the options handling below - -# generate a results file for the train data as well as test, usually slower -generate_train_files="false" - -# If set for true, looks for _TRAIN.ts file. This is useful for running tsml-java resamples -predefined_folds="false" - -# Normalise data before fit/predict -normalise_data="false" - -# ====================================================================================== -# Experiment configuration end -# ====================================================================================== - -# Set to -tr to generate test files -generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") - -# Set to -pr to use predefined folds -predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") - -# Set to -rn to normalise data -normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") - -count=0 -while read dataset; do -for regressor in $regressors_to_run; do - -# Skip to the script start point -((count++)) -if ((count>=start_point)); then - -# This is the loop to keep from dumping everything in the queue which is maintained around max_num_submitted jobs -num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -while [ "${num_jobs}" -ge "${max_num_submitted}" ] -do - echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted} - sleep 60 - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -done - -mkdir -p "${out_dir}${regressor}/${dataset}/" - -# This skips jobs which have test/train files already written to the results directory. Only looks for Resamples, not Folds (old file name) -array_jobs="" -for (( i=start_fold-1; i generatedFile.sub - -echo "${count} ${regressor}/${dataset}" - -sbatch < generatedFile.sub - -else - echo "${count} ${regressor}/${dataset}" has finished all required resamples, skipping -fi - -fi -done -done < ${datasets} - -echo Finished submitting jobs diff --git a/_tsml_research_resources/soton/iridis/serial_scripts/regression_experiments.sh b/_tsml_research_resources/soton/iridis/serial_scripts/regression_experiments.sh deleted file mode 100644 index c7eef2da..00000000 --- a/_tsml_research_resources/soton/iridis/serial_scripts/regression_experiments.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/bin/bash -# Check and edit all options before the first run! 
-# While reading is fine, please dont write anything to the default directories in this script - -# Start and end for resamples -max_folds=30 -start_fold=1 - -# To avoid hitting the cluster queue limit we have a higher level queue -max_num_submitted=100 - -# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx -queue="batch" - -# Enter your username and email here -username="ajb2u23" -mail="NONE" -mailto="$username@soton.ac.uk" - -# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue -max_memory=8000 - -# Max allowable is 60 hours -max_time="60:00:00" - -# Start point for the script i.e. 3 datasets, 3 regressors = 9 jobs to submit, start_point=5 will skip to job 5 -start_point=1 - -# Put your home directory here -local_path="/mainfs/home/$username/" - -# Datasets to use and directory of data files. Default is Tony's work space, all should be able to read these. Change if you want to use different data or lists -data_dir="$local_path/Data/" -datasets="$local_path/DataSetLists/Regression.txt" - -# Results and output file write location. Change these to reflect your own file structure -results_dir="$local_path/RegressionResults/results/" -out_dir="$local_path/RegressionResults/output/" - -# The python script we are running -script_file_path="$local_path/tsml-eval/tsml_eval/experiments/regression_experiments.py" - -# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md -# Separate environments for GPU and CPU are recommended -env_name="tsml-eval" - -# Regressors to loop over. Must be separated by a space -# See list of potential regressors in set_regressor -regressors_to_run="RocketRegressor TimeSeriesForestRegressor" - -# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args -# You will have to add any variable to the python call close to the bottom of the script -# and possibly to the options handling below - -# generate a results file for the train data as well as test, usually slower -generate_train_files="false" - -# If set for true, looks for _TRAIN.ts file. 
This is useful for running tsml-java resamples -predefined_folds="false" - -# Normalise data before fit/predict -normalise_data="false" - -# ====================================================================================== -# Experiment configuration end -# ====================================================================================== - -# Set to -tr to generate test files -generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") - -# Set to -pr to use predefined folds -predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") - -# Set to -rn to normalise data -normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") - -# dont submit to serial directly -queue=$([ "$queue" == "serial" ] && echo "batch" || echo "$queue") -queue_alias=$([ "$queue" == "batch" ] && echo "serial" || echo "$queue") - -count=0 -while read dataset; do -for regressor in $regressors_to_run; do - -# Skip to the script start point -((count++)) -if ((count>=start_point)); then - -# This is the loop to keep from dumping everything in the queue which is maintained around max_num_submitted jobs -num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -while [ "${num_jobs}" -ge "${max_num_submitted}" ] -do - echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted} - sleep 60 - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -done - -mkdir -p "${out_dir}${regressor}/${dataset}/" - -# This skips jobs which have test/train files already written to the results directory. Only looks for Resamples, not Folds (old file name) -array_jobs="" -for (( i=start_fold-1; i generatedFile.sub - -echo "${count} ${regressor}/${dataset}" - -sbatch < generatedFile.sub - -else - echo "${count} ${regressor}/${dataset}" has finished all required resamples, skipping -fi - -fi -done -done < ${datasets} - -echo Finished submitting jobs diff --git a/_tsml_research_resources/soton/iridis/staskfarm.sh b/_tsml_research_resources/soton/iridis/staskfarm.sh new file mode 100644 index 00000000..3bfd76fe --- /dev/null +++ b/_tsml_research_resources/soton/iridis/staskfarm.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# 2014-07-07 +# Simple taskfarm script for a Slurm environment. +# +# Purpose: take a file of tasks (one per line) and create slurm multi-prog +# config to execute those tasks. Each task can comprise of multiple commands. +# +# Background: the slurm multi-prog setup can be difficult for some +# scenarios: +# * only one executable can be specified per task (e.g. no chain of commands +# or shell loops are possible, such as "cd dir01; ./my_exec") +# * a limitation on the maximum number of characters per task description (256) +# * building the multi-prog file can be onerous, if you do not have the +# luxury of using the '%t' tokens in your commands or arguments +# * the number of commands must match exactly the number of slurm tasks (-n), +# which means updating two files if you wish to add or remove tasks +# +# 2017-10-17 +# Inspired by forked version by cmeesters, add a second mode of operation: +# * original: staskfarm commands.txt +# * new: staskfarm command param [param]... +# In this mode, allow a parameter sweep, prefaced by a single command. 
It +# may be easier than creating the equivalent commands.txt, and allows +# for shell globbing for example to generate the param list. + + +function usage { + cat <<-EOM + Usage: $(basename "$0") [-v] command_filename + or: $(basename "$0") [-v] command param [param]... + + In the first mode of operation: $(basename "$0") [-v] command_filename + + The must have one individual task per + line. The task can comprise of multiple bash shell commands, + each separated by a semi-colon (;). + + For example, the following shows 6 tasks: + + ./my_prog my_input01 > my_output01 + ./my_prog my_input02 > my_output02 + ./my_prog my_input03 > my_output03 + ./my_prog my_input04 > my_output04 + ./my_prog my_input05 > my_output05 + ./my_prog my_input06 > my_output06 + + A more complex example, showing 4 tasks which include loops: + + cd sample01; for i in controls patients; do ./my_prog \$i; done + cd sample02; for i in controls patients; do ./my_prog \$i; done + cd sample03; for i in controls patients; do ./my_prog \$i; done + cd sample04; for i in controls patients; do ./my_prog \$i; done + + Enabling verbose mode prints each command to stdout as it is + read from the command file. + + In the second mode of operation: $(basename "$0") [-v] command param [param]... + + The is combined with each of the individual parameters to + generate the list of tasks to be executed. The number of tasks will be equal + to the number of values. + + The values can either be a simple list (e.g. input1 input2...), + or a shell glob (e.g. *.inp). + + Note that no output redirection is performed in this mode. + + Limitations: + + * the use of MPI is not supported in the tasks. Only serial tasks + can appear in the task lists. + + * it writes the list of tasks to K files, where K is the value of + of the SLURM_NTASKS environment variable. The tasks are written + in a simple round-robin manner over the K files. This makes no + provision for how quickly any individual task might execute + compared to the others, and so an equal division of labour + between the SLURM_NTASKS processors is not guaranteed at all. + + * it makes no decisions about memory usage per task. The + assumption is that the user has already calculated memory + consumption, and has used a combination of "#SBATCH -n " + and "#SBATCH -N " to fit. For example, if the node has 8 + cores and 16 GB of RAM, then "#SBATCH -n 8" will spread the + tasks over 8 cores on one machine, and will assume that the + total memory usage is no more than 16GB (2GB per task). If you + need 4GB per task, then instead you must use "#SBATCH -n 8" + and "#SBATCH -N 2" in order to spread the 8 tasks + over 2 nodes. + + * no output redirection is performed, so any stdout/stderr will + be sent to the slurm-NNNNN.out file by default. This can + be changed by adding individual redirects to each task (in the + first mode of operation). Care must be taken in that case so + that the output files have unique names/paths. + + Note that this program will create a temporary directory + (called .taskfarm_job_\${SLURM_JOB_ID}) in which to store + the slurm multi-config files. 
+ +EOM +} + + +###################################################### +# Variables +###################################################### +verbose=0 +command_filename="" +# no-op dummy command, for when ncommands < SLURM_NTASKS +dummy_command=/bin/true +# loop counter +i=0 +# count of non-blank and non-comment lines: the number of actual commands +ncommands=0 + + +###################################################### +# Parse options +###################################################### +while getopts "vh" Option +do + case "$Option" in + v) verbose="1";; + h) usage; exit;; + *) usage; exit;; + esac +done +shift $((OPTIND-1)) + + +###################################################### +# Check for command_filename +###################################################### +if [ "$#" -eq "0" ] +then + usage; exit 1 +fi + + +###################################################### +# Check for slurm environment +###################################################### +if [ "x${SLURM_JOB_ID}" = "x" -o "x${SLURM_NTASKS}" = "x" ] +then + echo "$(basename "$0"): error: must be executed from within a SLURM allocation. Exiting." + exit 1 +fi + + +###################################################### +# Sanity check if old stale files exist +###################################################### +if [ -d ".taskfarm_job_${SLURM_JOB_ID}" ] +then + if [ "${verbose}" = "1" ] + then + echo "Deleting old job files .taskfarm_job_${SLURM_JOB_ID}/*.sh" + fi + rm -f .taskfarm_job_"${SLURM_JOB_ID}"/*.sh +fi + + +###################################################### +# Create the taskfarm directory +###################################################### +if [ "${verbose}" = "1" ] +then + echo "Creating taskfarm job directory .taskfarm_job_${SLURM_JOB_ID}" +fi +mkdir ".taskfarm_job_${SLURM_JOB_ID}" + + +###################################################### +# Two modes of operation: +# 1 if ($# == 1) then we've supplied a command_filename +# and go with the previous logic +# 2 else we've supplied a command and a list of params, +# so generate the tasks based on those +###################################################### + + +if [ "$#" -eq "1" ] +then + ###################################################### + # Mode 1: command_filename + ###################################################### + + command_filename=$1 + + ###################################################### + # Does the file exist? + ###################################################### + if [ ! -f "${command_filename}" ] + then + echo "$(basename "$0"): error: commands file ${command_filename} does not exist. Exiting." + exit 1 + fi + + + if [ "${verbose}" = "1" ] + then + echo "" + echo "-------------------- $(basename "$0") START --------------------" + echo "Reading commands from file: ${command_filename}." + echo "There are $(wc -l < "${command_filename}") lines in the file." + echo "They will be spread over the ${SLURM_NTASKS} processors: ${SLURM_TASKS_PER_NODE} tasks on ${SLURM_NODELIST}" + #md5sum ${command_filename} + fi + + + ###################################################### + # Warn if no output redirection + ###################################################### + if [ "${verbose}" = "1" ] && ! grep -q '>' "${command_filename}" + then + echo "" + echo "WARNING: there is no individual task output redirection in the ${command_filename}" + echo " file. This could potentially be a problem. Output of all individual" + echo " tasks will likely be merged in the slurm output file ('slurm-${SLURM_JOB_ID}.out')." 
+ echo "" + fi + + + ###################################################### + # Main loop: + # read the file, line by line, and create the + # individual multi-prog shell scripts + ###################################################### + while read line + do + # ignore blank lines and comment lines + if [[ $line =~ ^[[:blank:]]*# || $line =~ ^[[:blank:]]*$ ]] + then + if [ "${verbose}" = "1" ] + then + echo "Skipping blank and comment lines" + fi + continue + fi + + if [ "${verbose}" = "1" ] + then + echo "Adding the following line to .taskfarm_job_${SLURM_JOB_ID}/${i}.sh: $line" + fi + + echo "$line" >> ".taskfarm_job_${SLURM_JOB_ID}/${i}.sh" + + # increment, modulo the number of tasks + (( i = (i + 1) % SLURM_NTASKS )) + + # increment the total number of commands + (( ncommands++ )) + done < "${command_filename}" + +else + ###################################################### + # Mode 2: command + param list + ###################################################### + + command=$1 + + # the first positional arg is the 'command' variable, which we've saved above. + # remove it from the list + shift + + ###################################################### + # Does the file exist? + ###################################################### + if [ ! "which ${command}" ] + then + echo "$(basename "$0"): error: taskfarm command ${command} does not exist. Exiting." + exit 1 + fi + + + if [ "${verbose}" = "1" ] + then + echo "" + echo "-------------------- $(basename "$0") START --------------------" + echo "Using command: ${command} with $# parameters." + echo "They will be spread over the ${SLURM_NTASKS} processors: ${SLURM_TASKS_PER_NODE} tasks on ${SLURM_NODELIST}" + #md5sum ${command_filename} + fi + + + ###################################################### + # Warn if no output redirection + ###################################################### + if [ "${verbose}" = "1" ] + then + echo "" + echo "WARNING: there is no individual task output redirection." + echo " This could potentially be a problem. Output of all individual" + echo " tasks will likely be merged in the slurm output file ('slurm-${SLURM_JOB_ID}.out')." + echo "" + fi + + + ###################################################### + # Main loop: + # loop over the parameters, and create the + # individual multi-prog shell scripts + ###################################################### + + # just in case of quoted parameters (e.g. filenames with spaces) + SAVEIFS=$IFS + IFS=$(echo -en "\n\b") + + for param in "$@" + do + line="${command} ${param}" + + if [ "${verbose}" = "1" ] + then + echo "Adding the following line to .taskfarm_job_${SLURM_JOB_ID}/${i}.sh: $line" + fi + + echo "$line" >> ".taskfarm_job_${SLURM_JOB_ID}/${i}.sh" + + # increment, modulo the number of tasks + (( i = (i + 1) % SLURM_NTASKS )) + + # increment the total number of commands + (( ncommands++ )) + done + + IFS=$SAVEIFS +fi + +###################################################### +# Sanity check: if the number of commands is less +# than SLURM_NTASKS, then it the srun --multi-prog +# will error. Add dummy tasks to fill them out. 
+###################################################### + +while (( ncommands < SLURM_NTASKS )) +do + if [ "${verbose}" = "1" ] + then + echo "Adding the dummy command to .taskfarm_job_${SLURM_JOB_ID}/${i}.sh: ${dummy_command}" + fi + + echo "${dummy_command}" >> ".taskfarm_job_${SLURM_JOB_ID}/${i}.sh" + + # increment (don't need the modulo here, the loop guard takes care of it) + (( i++ )) + + # increment the total number of commands + (( ncommands++ )) +done + + +###################################################### +# Create the multi-prog file (using the '%t' token) +###################################################### +if [ "${verbose}" = "1" ] +then + echo "Creating the .taskfarm_job_${SLURM_JOB_ID}/multi.config file." +fi +echo "* bash .taskfarm_job_${SLURM_JOB_ID}/%t.sh" > ".taskfarm_job_${SLURM_JOB_ID}/multi.config" + + +###################################################### +# And finally run the slurm multi-prog task file +###################################################### +if [ "${verbose}" = "1" ] +then + echo "About to execute 'srun --multi-prog .taskfarm_job_${SLURM_JOB_ID}/multi.config'." + echo "==================== $(basename "$0") END ====================" + echo "" +fi +srun --multi-prog ".taskfarm_job_${SLURM_JOB_ID}/multi.config" diff --git a/_tsml_research_resources/soton/iridis/taskfarm_regression_experiments.sh b/_tsml_research_resources/soton/iridis/taskfarm_regression_experiments.sh new file mode 100644 index 00000000..a377db73 --- /dev/null +++ b/_tsml_research_resources/soton/iridis/taskfarm_regression_experiments.sh @@ -0,0 +1,329 @@ +#!/bin/bash +# Check and edit all options before the first run! +# While reading is fine, please dont write anything to the default directories in this script +set -eu +# ====================================================================================== +# Default experiment configuration start +# Create your own config.local or pass in --config to override these settings +# Use config.local.example as a template +# ====================================================================================== +# Start and end for resamples +max_folds=10 +start_fold=1 + +# To avoid hitting the cluster queue limit we have a higher level queue +max_num_submitted=900 + +gpu_job="false" +iridis_version="5" + +# The number of tasks/threads to use in each job. 40 is the number of cores on batch nodes +n_tasks_per_node=40 + +# The number of threads per task. Usually 1 unless using a regressor that can multithread internally +# use with threaded_regression_experiments.py +n_threads_per_task=1 + +# The number of cores to request from the node. Don't go over the number of cores for the node. 40 is the number of cores on batch nodes +# If you are not using the whole node, please make sure you are requesting memory correctly +max_cpus_to_use=40 + +# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue +max_memory=8000 + +# Create a separate submission list for each regressor. This will stop the mixing of +# large and small jobs in the same node, but results in some smaller scripts submitted +# to serial when moving between regressors. +# For small workloads i.e. single resample 10 datasets, turning this off will be the only way to get on the batch queue realistically +split_regressors="true" + +# Enter your username and email here +mail="NONE" + +# Max allowable is 60 hours +max_time="60:00:00" + +# Start point for the script i.e. 
+start_point=1
+
+# Datasets to use and directory of data files. This can either be a text file or directory of text files
+# Separate text files will not run jobs of the same dataset in the same node. This is good to keep large and small datasets separate
+relative_data_dir="Data/forecasting"
+relative_dataset_list="Data/windowed_series.txt"
+
+# Results and output file write location. Change these to reflect your own file structure
+relative_results_dir="RegressionResults/results/"
+relative_out_dir="RegressionResults/output/"
+
+# The python script we are running
+relative_script_file_path="tsml-eval/tsml_eval/experiments/forecasting_experiments.py"
+
+# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md
+# Separate environments for GPU and CPU are recommended (e.g. regress_gpu, regression_experiments)
+# env_name="regression_experiments"
+container_path="scratch/tensorflow_sandbox/"
+
+# Regressors to loop over. Must be separated by a space. Different regressors will not run in the same node
+# See list of potential regressors in set_regressor
+# regressors_to_run="ETSForecaster AutoETSForecaster SktimeETS StatsForecastETS" # RocketRegressor MultiRocketRegressor ResNetRegressor fpcregressor fpcr-b-spline TimeCNNRegressor FCNRegressor 1nn-ed 1nn-dtw 5nn-ed 5nn-dtw FreshPRINCERegressor TimeSeriesForestRegressor DrCIFRegressor InceptionTimeRegressor Ridge SVR RandomForestRegressor RotationForestRegressor xgboost
+
+# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args
+# You will have to add any variable to the python call close to the bottom of the script
+# and possibly to the options handling below
+
+# generate a results file for the train data as well as test, usually slower
+generate_train_files="false"
+
+# If set for true, looks for _TRAIN.ts file. This is useful for running tsml-java resamples
+predefined_folds="false"
+
+# Normalise data before fit/predict
+normalise_data="false"
+# ======================================================================================
+# Experiment configuration end
+# ======================================================================================
+
+# ======================================================================================
+# Read in config files and CLI args
+# ======================================================================================
+
+# Helper: source a file only if it exists, returning 0 either way so set -e is not tripped
+maybe_source() { [ ! -f "$1" ] || . "$1"; }
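+# As an illustration (hypothetical values, keys as in config.example), a
+# config.local picked up by the maybe_source calls below might override only a
+# few of the defaults above:
+#     username="arb1g19"
+#     env_name="regression_experiments"
+#     max_folds=5
+#     regressors_to_run="Ridge RocketRegressor"
+# Anything it does not set falls through to the defaults in this script, and a
+# single run can still override both via the CLI parsed below:
+#     ./taskfarm_regression_experiments.sh --config my_run.conf --regressors_to_run "DrCIFRegressor"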
"$1"; } + +# Resolve script dir +SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) + +# 1) Load configs in increasing priority +maybe_source "$SCRIPT_DIR/../../config.default" +maybe_source "$SCRIPT_DIR/../../config.local" + +# 2) Parse CLI overrides (highest priority) +CONFIG_FILE="" +while [ "$#" -gt 0 ]; do + case "$1" in + --config) CONFIG_FILE=$2; shift 2 ;; + --regressors_to_run) regressors_to_run=$2; shift 2 ;; + --debug) DEBUG=1; shift ;; + --) shift; break ;; + *) echo "Unknown option: $1" >&2; exit 2 ;; + esac +done +[ -n "${CONFIG_FILE:-}" ] && maybe_source "$CONFIG_FILE" + +# 3) Validate required vars +: "${username:?Set username in config file}" +: "${env_name:?Set env_name in config file}" +: "${regressors_to_run:?Set regressors_to_run in config file or CLI}" + +[ "$DEBUG" = "1" ] && echo "Running $regressors_to_run" + +# ====================================================================================== +# Read in config files and CLI args +# ====================================================================================== + +mailto=$username"@soton.ac.uk" + +# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx +if [ "${gpu_job}" == "true" ]; then + if [ "${iridis_version}" == "5" ]; then + queue="gpu" + else + queue="a100" + fi +else + queue="batch" +fi + +# Different home paths on iridis5 and iridis6/iridisX +if [ "${iridis_version}" == "5" ]; then + local_path="/ECShome/$username" +else + local_path="/home/$username" +fi + +# staskfarm doesn't exist on iridis6 or iridisX +if [ "${iridis_version}" == "5" ]; then + taskfarm_file_path="staskfarm" +else + taskfarm_file_path="$local_path/tsml-eval/_tsml_research_resources/soton/iridis/staskfarm.sh" +fi + +# The python script we are running +full_script_file_path="$local_path/$relative_script_file_path" + +# Datasets to use and directory of data files. This can either be a text file or directory of text files +# Separate text files will not run jobs of the same dataset in the same node. This is good to keep large and small datasets separate +data_dir="$local_path/$relative_data_dir" +dataset_list="$local_path/$relative_dataset_list" + +# Results and output file write location. 
+results_dir="$local_path/$relative_results_dir" +out_dir="$local_path/$relative_out_dir" + +# Set to -tr to generate test files +generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") + +# Set to -pr to use predefined folds +predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") + +# Set to -rn to normalise data +normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") + +mkdir -p "${out_dir}/" + +if [ "${iridis_version}" == "5" ]; then + conda_instruction="anaconda/py3.10" +else + conda_instruction="conda/python3" +fi + +if [ "${gpu_job}" == "true" ]; then + gpu_instruction="#SBATCH --gres=gpu:1" +else + gpu_instruction="" +fi + +# This creates the submission file to run and does clean up +submit_jobs () { + +if ((cmdCount>=max_cpus_to_use)); then + cpuCount=$max_cpus_to_use +else + cpuCount=$cmdCount +fi + +if [ "${gpu_job}" == "true" ]; then + environment_instructions="module load apptainer/1.3.3" + apptainer_instruction="apptainer exec --nv ${container_path} echo "Running Apptainer job."; " +else + environment_instructions="module load $conda_instruction && source activate $env_name" + apptainer_instruction="" +fi + +echo "#!/bin/bash +${gpu_instruction} +#SBATCH --mail-type=${mail} +#SBATCH --mail-user=${mailto} +#SBATCH --job-name=batch-${dt} +#SBATCH -p ${queue} +#SBATCH -t ${max_time} +#SBATCH -o ${out_dir}/%A-${dt}.out +#SBATCH -e ${out_dir}/%A-${dt}.err +#SBATCH --nodes=1 +#SBATCH --ntasks=${cpuCount} +#SBATCH --mem=${max_memory}M + +. /etc/profile + +${environment_instructions} + + + +${taskfarm_file_path} ${out_dir}/generatedCommandList-${dt}.txt" > generatedSubmissionFile-${dt}.sub + +echo "At experiment ${expCount}, ${totalCount} jobs submitted total" + +sbatch < generatedSubmissionFile-${dt}.sub + +rm generatedSubmissionFile-${dt}.sub + +} + +totalCount=0 +expCount=0 +dt=$(date +%Y%m%d%H%M%S) + +# turn a directory of files into a list +if [[ -d $dataset_list ]]; then + file_names="" + for file in ${dataset_list}/*; do + file_names="$file_names$dataset_list$(basename "$file") " + done + dataset_list=$file_names +fi + +for dataset_file in $dataset_list; do + +echo "Dataset list ${dataset_file}" + +for regressor in $regressors_to_run; do + +mkdir -p "${out_dir}/${regressor}/" + +if [ "${split_regressors,,}" == "true" ]; then + # we use time for unique names + sleep 1 + cmdCount=0 + dt=$(date +%Y%m%d%H%M%S) + outDir=${out_dir}/${regressor} +else + outDir=${out_dir} +fi + +while read dataset; do + +# Skip to the script start point +((expCount++)) +if ((expCount>=start_point)); then + +# This finds the resamples to run and skips jobs which have test/train files already written to the results directory. 
+# This finds the resamples to run and skips jobs which have test/train files already written to the results directory.
+# This can result in uneven sized command lists
+resamples_to_run=""
+for (( i=start_fold-1; i<max_folds; i++ ))
+do
+    if [[ ! -f "${results_dir}${regressor}/Predictions/${dataset}/testResample${i}.csv" || ( -n "${generate_train_files}" && ! -f "${results_dir}${regressor}/Predictions/${dataset}/trainResample${i}.csv" ) ]]; then
+        resamples_to_run="${resamples_to_run}${i} "
+    fi
+done
+
+for resample in $resamples_to_run; do
+
+if ((cmdCount>(n_tasks_per_node-n_threads_per_task))); then
+    submit_jobs
+
+    # This is the loop to stop you from dumping everything in the queue at once, see max_num_submitted
+    num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l)
+    echo "Number of jobs currently running on the cluster: ${num_jobs}"
+    while [ "${num_jobs}" -ge "${max_num_submitted}" ]
+    do
+        echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted}
+        sleep 60
+        num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l)
+    done
+
+    sleep 1
+    cmdCount=0
+    dt=$(date +%Y%m%d%H%M%S)
+fi
+
+# Input args to the default forecasting_experiments are in the main method of
+# https://github.com/time-series-machine-learning/tsml-eval/blob/main/tsml_eval/experiments/forecasting_experiments.py
+echo "${apptainer_instruction}python -u ${full_script_file_path} ${data_dir} ${results_dir} ${regressor} ${dataset} ${resample} ${generate_train_files} ${predefined_folds} ${normalise_data} > ${out_dir}/${regressor}/output-${dataset}-${resample}-${dt}.txt 2>&1" >> ${out_dir}/generatedCommandList-${dt}.txt
+
+((cmdCount=cmdCount+n_threads_per_task))
+((totalCount++))
+
+done
+fi
+done < ${dataset_file}
+
+if [[ "${split_regressors,,}" == "true" && $cmdCount -gt 0 ]]; then
+    # final submit for this regressor
+    submit_jobs
+fi
+
+done
+
+if [[ "${split_regressors,,}" != "true" && $cmdCount -gt 0 ]]; then
+    # final submit for this dataset list
+    submit_jobs
+fi
+
+done
+
+echo Finished submitting jobs
diff --git a/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_regression_experiments.sh b/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_regression_experiments.sh
deleted file mode 100644
index e10b768f..00000000
--- a/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_regression_experiments.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/bin/bash
-# Check and edit all options before the first run!
-# While reading is fine, please dont write anything to the default directories in this script
-
-# Start and end for resamples
-max_folds=30
-start_fold=1
-
-# To avoid hitting the cluster queue limit we have a higher level queue
-max_num_submitted=100
-
-# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx
-queue="batch"
-
-# The number of threads to request. Please check the number of cores for the node you are using, some are exclusive (i.e. batch) so you should request all cores or a taskfarm script
-n_threads=10
-
-# Enter your username and email here
-username="ajb2u23"
-mail="NONE"
-mailto="$username@soton.ac.uk"
-
-# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue
-max_memory=8000
-
-# Max allowable is 60 hours
-max_time="60:00:00"
-
-# Start point for the script i.e. 3 datasets, 3 regressors = 9 jobs to submit, start_point=5 will skip to job 5
-start_point=1
-
-# Put your home directory here
-local_path="/mainfs/home/$username/"
-
-# Datasets to use and directory of data files. Default is Tony's work space, all should be able to read these.
Change if you want to use different data or lists -data_dir="$local_path/Data/" -datasets="$local_path/DataSetLists/Regression.txt" - -# Results and output file write location. Change these to reflect your own file structure -results_dir="$local_path/RegressionResults/results/" -out_dir="$local_path/RegressionResults/output/" - -# The python script we are running -script_file_path="$local_path/tsml-eval/tsml_eval/experiments/threaded_regression_experiments.py" - -# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md -# Separate environments for GPU and CPU are recommended -env_name="tsml-eval" - -# Regressors to loop over. Must be separated by a space -# See list of potential regressors in set_regressor -regressors_to_run="RocketRegressor TimeSeriesForestRegressor" - -# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args -# You will have to add any variable to the python call close to the bottom of the script -# and possibly to the options handling below - -# generate a results file for the train data as well as test, usually slower -generate_train_files="false" - -# If set for true, looks for _TRAIN.ts file. This is useful for running tsml-java resamples -predefined_folds="false" - -# Normalise data before fit/predict -normalise_data="false" - -# ====================================================================================== -# Experiment configuration end -# ====================================================================================== - -# Set to -tr to generate test files -generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") - -# Set to -pr to use predefined folds -predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") - -# Set to -rn to normalise data -normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") - -# dont submit to serial directly -queue=$([ "$queue" == "serial" ] && echo "batch" || echo "$queue") -queue_alias=$([ "$queue" == "batch" ] && echo "serial" || echo "$queue") - -count=0 -while read dataset; do -for regressor in $regressors_to_run; do - -# Skip to the script start point -((count++)) -if ((count>=start_point)); then - -# This is the loop to keep from dumping everything in the queue which is maintained around max_num_submitted jobs -num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -while [ "${num_jobs}" -ge "${max_num_submitted}" ] -do - echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted} - sleep 60 - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l) -done - -mkdir -p "${out_dir}${regressor}/${dataset}/" - -# This skips jobs which have test/train files already written to the results directory. 
Only looks for Resamples, not Folds (old file name) -array_jobs="" -for (( i=start_fold-1; i generatedFile.sub - -echo "${count} ${regressor}/${dataset}" - -sbatch < generatedFile.sub - -else - echo "${count} ${regressor}/${dataset}" has finished all required resamples, skipping -fi - -fi -done -done < ${datasets} - -echo Finished submitting jobs diff --git a/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_taskfarm_regression_experiments.sh b/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_taskfarm_regression_experiments.sh deleted file mode 100644 index c1511f8c..00000000 --- a/_tsml_research_resources/soton/iridis/threaded_scripts/threaded_taskfarm_regression_experiments.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/bin/bash -# Check and edit all options before the first run! -# While reading is fine, please dont write anything to the default directories in this script - -# Start and end for resamples -max_folds=10 -start_fold=1 - -# To avoid hitting the cluster queue limit we have a higher level queue -max_num_submitted=900 - -# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx -queue="batch" - -# The number of tasks to submit in each job. This can be larger than the number of cores, but tasks will be delayed until a core is free -n_tasks_per_node=4 - -# The number of threads to use per task. You can only run as many tasks as there are CPUs available, 4 tasks with 10 threads will take up a full batch node -n_threads_per_task=10 - -# The number of cores to request from the node. Don't go over the number of cores for the node. 40 is the number of cores on batch nodes -# If you are not using the whole node, please make sure you are requesting memory correctly -max_cpus_to_use=40 - -# Create a separate submission list for each regressor. This will stop the mixing of -# large and small jobs in the same node, but results in some smaller scripts submitted -# to serial when moving between regressors. -# For small workloads i.e. single resample 10 datasets, turning this off will be the only way to get on the batch queue realistically -split_regressors="true" - -# Enter your username and email here -username="ajb2u23" -mail="NONE" -mailto=$username"@soton.ac.uk" - -# Max allowable is 60 hours -max_time="60:00:00" - -# Start point for the script i.e. 3 datasets, 3 regressors = 9 experiments to submit, start_point=5 will skip to job 5 -start_point=1 - -# Put your home directory here -local_path="/mainfs/home/$username/" - -# Datasets to use and directory of data files. Dataset list can either be a text file or directory of text files -# Separate text files will not run jobs of the same dataset in the same node. This is good to keep large and small datasets separate -data_dir="$local_path/Data/" -dataset_list="$local_path/DataSetLists/RegressionBatch/" - -# Results and output file write location. Change these to reflect your own file structure -results_dir="$local_path/RegressionResults/results/" -out_dir="$local_path/RegressionResults/output/" - -# The python script we are running -script_file_path="$local_path/tsml-eval/tsml_eval/experiments/threaded_regression_experiments.py" - -# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md -# Separate environments for GPU and CPU are recommended -env_name="eval-py11" - -# Regressors to loop over. Must be separated by a space. 
Different regressors will not run in the same node by default -# See list of potential regressors in set_regressor -regressors_to_run="ROCKET DrCIF" - -# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args -# You will have to add any variable to the python call close to the bottom of the script -# and possibly to the options handling below - -# generate a results file for the train data as well as test, usually slower -generate_train_files="false" - -# If set for true, looks for _TRAIN.ts file. This is useful for running tsml-java resamples -predefined_folds="false" - -# Normalise data before fit/predict -normalise_data="false" - -# ====================================================================================== -# Experiment configuration end -# ====================================================================================== - -# Set to -tr to generate test files -generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "") - -# Set to -pr to use predefined folds -predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "") - -# Set to -rn to normalise data -normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "") - -# This creates the submission file to run and does clean up -submit_jobs () { - -totalThreads=$((cmdCount * n_threads_per_task)) -if ((totalJobs>=max_cpus_to_use)); then - cpuCount=$max_cpus_to_use -else - cpuCount=$totalThreads -fi - -echo "#!/bin/bash -#SBATCH --mail-type=${mail} -#SBATCH --mail-user=${mailto} -#SBATCH --job-name=batch-${dt} -#SBATCH -p ${queue} -#SBATCH -t ${max_time} -#SBATCH -o ${outDir}/%A-${dt}.out -#SBATCH -e ${outDir}/%A-${dt}.err -#SBATCH --nodes=1 -#SBATCH --ntasks=${cpuCount} - -. /etc/profile - -module load anaconda/py3.10 -source activate $env_name - -staskfarm ${outDir}/generatedCommandList-${dt}.txt" > generatedSubmissionFile-${dt}.sub - -echo "At experiment ${expCount}, ${totalCount} jobs submitted total" - -sbatch < generatedSubmissionFile-${dt}.sub - -rm generatedSubmissionFile-${dt}.sub - -} - -totalCount=0 -expCount=0 -dt=$(date +%Y%m%d%H%M%S) - -# turn a directory of files into a list -if [[ -d $dataset_list ]]; then - file_names="" - for file in ${dataset_list}/*; do - file_names="$file_names$dataset_list$(basename "$file") " - done - dataset_list=$file_names -fi - -for dataset_file in $dataset_list; do - -echo "Dataset list ${dataset_file}" - -for regressor in $regressors_to_run; do - -mkdir -p "${out_dir}/${regressor}/" - -if [ "${split_regressors,,}" == "true" ]; then - # we use time for unique names - sleep 1 - cmdCount=0 - dt=$(date +%Y%m%d%H%M%S) - outDir=${out_dir}/${regressor} -else - outDir=${out_dir} -fi - -while read dataset; do - -# Skip to the script start point -((expCount++)) -if ((expCount>=start_point)); then - -# This finds the resamples to run and skips jobs which have test/train files already written to the results directory. 
-# This can result in uneven sized command lists -resamples_to_run="" -for (( i=start_fold-1; i=n_tasks_per_node)); then - submit_jobs - - # This is the loop to stop you from dumping everything in the queue at once, see max_num_submitted - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l) - while [ "${num_jobs}" -ge "${max_num_submitted}" ] - do - echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted} - sleep 60 - num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue}" -e "PD ${queue}" | wc -l) - done - - sleep 1 - cmdCount=0 - dt=$(date +%Y%m%d%H%M%S) -fi - -# Input args to the default threaded_regression_experiments are in main method of -# https://github.com/time-series-machine-learning/tsml-eval/blob/main/tsml_eval/experiments/threaded_regression_experiments.py -echo "python -u ${script_file_path} ${data_dir} ${results_dir} ${regressor} ${dataset} ${resample} ${generate_train_files} ${predefined_folds} ${normalise_data} > ${out_dir}/${regressor}/output-${dataset}-${resample}-${dt}.txt 2>&1" >> ${outDir}/generatedCommandList-${dt}.txt - -((cmdCount++)) -((totalCount++)) - -done -fi -done < ${dataset_file} - -if [[ "${split_regressors,,}" == "true" && $cmdCount -gt 0 ]]; then - # final submit for this regressor - submit_jobs -fi - -done - -if [[ "${split_regressors,,}" != "true" && $cmdCount -gt 0 ]]; then - # final submit for this dataset list - submit_jobs -fi - -done - -echo Finished submitting jobs diff --git a/do_forecasting_stuff.bat b/do_forecasting_stuff.bat new file mode 100644 index 00000000..e17747e9 --- /dev/null +++ b/do_forecasting_stuff.bat @@ -0,0 +1,118 @@ +@echo off +setlocal enabledelayedexpansion + +REM Activate the virtual environment +call ..\tsml-eval-venv\Scripts\activate.bat + +set SCRIPT_PATH=tsml_eval\experiments\forecasting_experiments.py +set DATA_DIR=..\aeon\aeon\datasets\local_data\differenced_forecasting +set RESULTS_DIR=..\ForecastingResults\differenced_results +set FORECASTER=AutoSARIMA +set FIXED_PARAM=0 + +for %%D in ( + weather_dataset_T1 + weather_dataset_T2 + weather_dataset_T3 + weather_dataset_T4 + weather_dataset_T5 + solar_10_minutes_dataset_T1 + solar_10_minutes_dataset_T2 + solar_10_minutes_dataset_T3 + solar_10_minutes_dataset_T4 + solar_10_minutes_dataset_T5 + sunspot_dataset_without_missing_values_T1 + wind_farms_minutely_dataset_without_missing_values_T1 + wind_farms_minutely_dataset_without_missing_values_T3 + wind_farms_minutely_dataset_without_missing_values_T4 + wind_farms_minutely_dataset_without_missing_values_T5 + elecdemand_dataset_T1 + us_births_dataset_T1 + saugeenday_dataset_T1 + london_smart_meters_dataset_without_missing_values_T1 + london_smart_meters_dataset_without_missing_values_T2 + london_smart_meters_dataset_without_missing_values_T3 + traffic_hourly_dataset_T1 + traffic_hourly_dataset_T2 + traffic_hourly_dataset_T3 + traffic_hourly_dataset_T4 + traffic_hourly_dataset_T5 + electricity_hourly_dataset_T1 + electricity_hourly_dataset_T2 + electricity_hourly_dataset_T3 + pedestrian_counts_dataset_T1 + pedestrian_counts_dataset_T2 + pedestrian_counts_dataset_T3 + pedestrian_counts_dataset_T4 + pedestrian_counts_dataset_T5 + kdd_cup_2018_dataset_without_missing_values_T1 + australian_electricity_demand_dataset_T1 + australian_electricity_demand_dataset_T2 + australian_electricity_demand_dataset_T3 + oikolab_weather_dataset_T1 + oikolab_weather_dataset_T2 + 
oikolab_weather_dataset_T3 + oikolab_weather_dataset_T4 + m4_monthly_dataset_T122 + m4_monthly_dataset_T145 + m4_monthly_dataset_T180 + m4_monthly_dataset_T186 + m4_monthly_dataset_T17051 + m4_monthly_dataset_T17088 + m4_monthly_dataset_T17132 + m4_monthly_dataset_T17146 + m4_monthly_dataset_T26710 + m4_monthly_dataset_T27138 + m4_monthly_dataset_T27170 + m4_monthly_dataset_T27175 + m4_monthly_dataset_T27186 + m4_monthly_dataset_T37009 + m4_monthly_dataset_T37070 + m4_monthly_dataset_T37238 + m4_monthly_dataset_T37248 + m4_monthly_dataset_T47915 + m4_weekly_dataset_T1 + m4_weekly_dataset_T2 + m4_weekly_dataset_T19 + m4_weekly_dataset_T20 + m4_weekly_dataset_T21 + m4_weekly_dataset_T55 + m4_weekly_dataset_T56 + m4_weekly_dataset_T60 + m4_weekly_dataset_T61 + m4_weekly_dataset_T62 + m4_weekly_dataset_T224 + m4_weekly_dataset_T225 + m4_weekly_dataset_T226 + m4_weekly_dataset_T227 + m4_weekly_dataset_T248 + m4_weekly_dataset_T249 + m4_weekly_dataset_T250 + m4_daily_dataset_T1 + m4_daily_dataset_T2 + m4_daily_dataset_T6 + m4_daily_dataset_T130 + m4_daily_dataset_T131 + m4_daily_dataset_T145 + m4_daily_dataset_T1604 + m4_daily_dataset_T1605 + m4_daily_dataset_T1606 + m4_daily_dataset_T1607 + m4_daily_dataset_T1614 + m4_daily_dataset_T1615 + m4_daily_dataset_T1634 + m4_daily_dataset_T1650 + m4_daily_dataset_T2036 + m4_daily_dataset_T2037 + m4_daily_dataset_T2041 + m4_daily_dataset_T3595 + m4_daily_dataset_T3597 + m4_hourly_dataset_T170 + m4_hourly_dataset_T171 + m4_hourly_dataset_T172 +) do ( + echo Running for dataset: %%D + python -u %SCRIPT_PATH% %DATA_DIR% %RESULTS_DIR% %FORECASTER% %%D %FIXED_PARAM% +) + +endlocal diff --git a/process_results.py b/process_results.py new file mode 100644 index 00000000..1d7d2db3 --- /dev/null +++ b/process_results.py @@ -0,0 +1,76 @@ +import sys + +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/aeon") +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/tsml-eval") + +from tsml_eval.evaluation.multiple_estimator_evaluation import ( + evaluate_forecasters_by_problem, +) + +if __name__ == "__main__": + evaluate_forecasters_by_problem( + "../ForecastingResults/master_comparison", + [ + # "ETSForecaster", + "AutoETSForecaster", + # "AutoARIMA", + # "AutoSARIMA", + # "d-ETSForecaster", + # "d-AutoETSForecaster", + "d-AutoARIMA", + # "d-AutoSARIMA", + # "SktimeETS", + # "StatsForecastETS", + "NaiveForecaster", + # "fpcregressor", + # "fpcr-b-spline", + # "FCNRegressor", + # "1nn-ed", + # "1nn-dtw", + # "5nn-ed", + # "5nn-dtw", + # "SVR", + "Ridge", + # "RocketRegressor", + # "MultiRocketRegressor", + # "ResNetRegressor", + # "IndividualInceptionRegressor", + # "InceptionTimeRegressor", + # "TimeCNNRegressor", + # "FreshPRINCERegressor", + # "TimeSeriesForestRegressor", + # "DrCIFRegressor", + # "RandomForestRegressor", + # "xgboost", + "d-Ridge", + # "d-RocketRegressor", + # "d-MultiRocketRegressor", + # "d-ResNetRegressor", + # "d-IndividualInceptionRegressor", + "d-InceptionTimeRegressor", + # "d-TimeCNNRegressor", + # "d-FreshPRINCERegressor", + "d-TimeSeriesForestRegressor", + # "d-DrCIFRegressor", + "d-RandomForestRegressor", + # "d-xgboost", + "pd-Ridge", + # "pd-RocketRegressor", + # "pd-MultiRocketRegressor", + # "pd-ResNetRegressor", + # "pd-IndividualInceptionRegressor", + # "pd-InceptionTimeRegressor", + # "pd-TimeCNNRegressor", + # "pd-FreshPRINCERegressor", + # "pd-TimeSeriesForestRegressor", + "pd-DrCIFRegressor", + # "pd-RandomForestRegressor", + # "pd-xgboost", + ], + 
"../aeon/aeon/datasets/local_data/forecasting/windowed_series.txt", + "../Plots/Comparisons", + 1, + False, + None, + False, + ) diff --git a/tsml_eval/_wip/forecasting/exponential_smoothing.py b/tsml_eval/_wip/forecasting/exponential_smoothing.py deleted file mode 100644 index 3c4c3706..00000000 --- a/tsml_eval/_wip/forecasting/exponential_smoothing.py +++ /dev/null @@ -1,295 +0,0 @@ -"""Implementation of Hyndman C functions for exponential smoothing in numba. - -Three functions from here -https://github.com/robjhyndman/forecast/blob/master/src/etscalc.c - -// Functions called by R -void etscalc(double *, int *, double *, int *, int *, int *, int *, - double *, double *, double *, double *, double *, double *, double *, int*); -void etssimulate(double *, int *, int *, int *, int *, - double *, double *, double *, double *, int *, double *, double *); -void etsforecast(double *, int *, int *, int *, double *, int *, double *); - - -Nixtla version is generated by a notebook using something called mbdev. - -https://github.com/Nixtla/statsforecast/blob/main/statsforecast/ets.py - -completely undocumented. We need to verify what each of the parameters mean, -and check translation. -""" -import math - -from numba import njit -import numpy as np - - -NA = -99999.0 -MAX_NMSE = 30 -MAX_SEASONAL_PERIOD = 24 - -@njit(fastmath=True, cache=True) -def fit_ets(y, n, x, m, error, trend, season, alpha, beta, gamma, phi, e, lik, amse, nmse): - """Exponential smoothing - - Check parameters map to Hyndman?? Why 14 not 15? - - Parameters - ---------- - y : np.ndarray - Time series data. - n : int - The length of the time series. - x : np.ndarray - Initial states of the ETS model. Starting values for the level, trend, and seasonal - components. This variable evolves during execution to store the states at each time - step (i.e., the state space matrix). - m : int - The period of the seasonality (e.g., for quaterly data m = 4) - error : int - The type of error model (0 -> None, 1 -> additive, 2 -> multiplicative). - trend : int - The type of trend model (0 -> None, 1 -> additive, 2 -> multiplicative). - season : int - The type of seasonality model (0 -> None, 1 -> additive, 2 -> multiplicative). - alpha : float - Smoothing parameter for the level. - beta : float - Smoothing parameter for the trend. - gamma : float - Smoothing parameter for the seasonality. - phi : float - Damping parameter. - e : np.ndarray - Residuals of the fitted model. - lik : np.ndarray - Likelihood measure. - amse : np.ndarray - Empty array for storing the Average Mean Squared Error. - nmse : int - The number of steps ahead to be considered for the calculation of AMSE. Determines - the forcasting horizon. - - Returns - ------- - """ - assert (m <= MAX_SEASONAL_PERIOD) or (season == 0), "Seasonal period must be <= 24 if seasonality is enabled" - if m < 1: - m = 1 - if nmse > MAX_NMSE: - nmse = MAX_NMSE - - olds = np.zeros(MAX_SEASONAL_PERIOD) - s = np.zeros(MAX_SEASONAL_PERIOD) - f = np.zeros(MAX_NMSE) - denom = np.zeros(MAX_NMSE) - nstates = 1 + (trend > 0) + m*(season > 0) - lik[0] = 0.0 - lik2 = 0.0 - denom = np.zeros(nmse) - - l = x[0] - if trend > 0: - b = x[1] - else: - b = 0.0 - if season > 0: - for j in range(m): - s[j] = x[(trend > 0) + j + 1] - - for i in range(n): - # Copy previous state. - oldl = l - if trend > 0: - oldb = b - if season > 0: - for j in range(m): - olds[j] = s[j] - - # One step forecast. 
- forecast(oldl, oldb, olds, m, trend, season, phi, f, nmse) - if(math.fabs(f[0] - NA) < 1.0e-10): # TOL - lik[0] = NA - return - - if error == 1: # Additive error model. - e[i] = y[i] - f[0] - else: - e[i] = (y[i] - f[0]) / f[0] - - for j in range(nmse): - if i+j < n: - denom[j] += 1.0 - tmp = y[i+j] - f[j] - amse[j] = (amse[j] * (denom[j]-1)+(tmp*tmp)) / denom[j] - - # Update state. - l, b, s = update(oldl, l, oldb, b, olds, s, m, trend, season, alpha, beta, gamma, phi, y[i]) - - # Store new state. - x[nstates*(i+1)] = l - if trend > 0: - x[nstates*(i+1)+1] = b - if season > 0: - for j in range(m): - x[(trend > 0) + nstates*(i+1) + j + 1] = s[j] - lik[0] = lik[0] + e[i]*e[i] - lik2 += np.log(math.fabs(f[0])) - - lik[0] = n * np.log(lik[0]) - if error == 2: # Multiplicative error model. - lik[0] = lik[0] + 2*lik2 - return f - -@njit(fastmath=True, cache=True) -def forecast(l, b, s, m, trend, season, phi, f, h): - """Performs forcasting. - - Helper function for fit_ets. - - Parameters - ---------- - l : float - Current level. - b : float - Current trend. - s : float - Current seasonal components. - m : int - The period of the seasonality (e.g., for quaterly data m = 4) - trend : int - The type of trend model (0 -> None, 1 -> additive, 2 -> multiplicative). - season : int - The type of seasonality model (0 -> None, 1 -> additive, 2 -> multiplicative). - phi : float - Damping parameter. - f : np.ndarray - Array to store forcasted values. - h : int - The number of steps ahead to forcast. - """ - phistar = phi - - # Forecasts - for i in range(h): - if trend == 0: # No trend component. - f[i] = l - elif trend == 1: # Additive trend component. - f[i] = l + phistar*b - elif b < 0: - f[i] = NA - else: - f[i] = l * b**phistar - - j = m - 1 - i - while j < 0: - j += m - - if season == 1: # Additive seasonal component. - f[i] = f[i] + s[j] - elif season == 2: # Multiplicative seasonal component. - f[i] = f[i] * s[j] - if i < (h-1): - if math.fabs(phi-1) < 1.0e-10: # TOL - phistar = phistar + 1 - else: - phistar = phistar + phi**(i+1) - - -@njit(fastmath=True, cache=True) -def update(oldl, l, oldb, b, olds, s, m, trend, season, alpha, beta, gamma, phi, y): - """Updates states. - - Helper function for fit_ets - - Parameters - ---------- - oldl : float - Previous level. - l : float - Current level. - oldb : float - Previous trend. - b : float - Current trend. - olds : np.ndarray - Previous seasonal components. - s : np.ndarray - Current seasonal components. - m : int - The period of the seasonality (e.g., for quaterly data m = 4) - trend : int - The type of trend model (0 -> None, 1 -> additive, 2 -> multiplicative). - season : int - The type of seasonality model (0 -> None, 1 -> additive, 2 -> multiplicative). - alpha : float - Smoothing parameter for the level. - beta : float - Smoothing parameter for the trend. - gamma : float - Smoothing parameter for the seasonality. - phi : float - Damping parameter. - y : np.ndarray - Time series data. - - Returns - ---------- - l : float - Updated level. - b : float - Updated trend. - s : float - Updated seasonal components. - """ - # New level. - if trend == 0: # No trend component. - phib = 0 - q = oldl # l(t-1) - elif trend == 1: # Additive trend component. - phib = phi*(oldb) - q = oldl + phib # l(t-1) + phi*b(t-1) - elif math.fabs(phi-1) < 1.0e-10: # TOL - phib = oldb - q = oldl * oldb # l(t-1) * b(t-1) - else: - phib = oldb**phi - q = oldl * phib # l(t-1) * b(t-1)^phi - - if season == 0: # No seasonal component. 
- p = y - elif season == 1: # Additive seasonal component. - p = y - olds[m-1] # y[t] - s[t-m] - else: - if math.fabs(olds[m-1]) < 1.0e-10: # TOL - p = 1.0e10 # HUGEN - else: - p = y / olds[m-1] # y[t] / s[t-m] - l = q + alpha*(p-q) - - # New growth. - if trend > 0: - if trend == 1: # Additive trend component. - r = l - oldl # l[t] - l[t-1] - else: # Multiplicative trend component. - if math.fabs(oldl) < 1.0e-10: # TOL - r = 1.0e10 # HUGEN - else: - r = l / oldl # l[t] / l[t-1] - b = phib + (beta / alpha)*(r - phib) # b[t] = phi*b[t-1] + beta*(r - phi*b[t-1]) - # b[t] = b[t-1]^phi + beta*(r - b[t-1]^phi) - - # New season. - if season > 0: - if season == 1: # Additive seasonal component. - t = y - q - else: # Multiplicative seasonal compoenent. - if math.fabs(q) < 1.0e-10: - t = 1.0e10 - else: - t = y / q - s[0] = olds[m-1] + gamma*(t - olds[m-1]) # s[t] = s[t-m] + gamma*(t - s[t-m]) - for j in range(m): - s[j] = olds[j-1] # s[t] = s[t] - - return l, b, s diff --git a/tsml_eval/_wip/forecasting/exponential_smoothing_comparison.py b/tsml_eval/_wip/forecasting/exponential_smoothing_comparison.py deleted file mode 100644 index 4215885c..00000000 --- a/tsml_eval/_wip/forecasting/exponential_smoothing_comparison.py +++ /dev/null @@ -1,84 +0,0 @@ -import time - -import numba -import numpy as np -from tsml_eval._wip.forecasting.exponential_smoothing import fit_ets -from statsforecast.ets import etscalc -from statsforecast.utils import AirPassengers as ap - -NA = -99999.0 -MAX_NMSE = 30 -MAX_SEASONAL_PERIOD = 24 - -def setup(): - y = ap - n = len(ap) - m = 12 - error = 1 - trend = 1 - season = 1 - nstates = 1 + (trend > 0) + m * (season > 0) - init_states = np.zeros(n * (nstates + 1)) - init_states[0] = y[0] - init_states[1] = (y[1]-y[0]) / 2 - alpha = 0.016763333 - beta = 0.001766333 - gamma = 0. 
- phi = 0.1 - e = np.zeros(n) - lik_fitets = np.zeros(1) - amse = np.zeros(MAX_NMSE) - nmse = 3 - return y, n, init_states, m, error, trend, season, alpha, beta, gamma, phi, e, lik_fitets, amse, nmse - - -def test_ets_comparison(setup): - y, n, init_states, m, error, trend, season, alpha, beta, gamma, phi, e, lik_fitets, amse, nmse = setup - - # tsml-eval implementation - start = time.time() - f1=fit_ets(y, n, init_states, m, - error, trend, season, - alpha, beta, gamma, phi, - e, lik_fitets, amse, nmse) - end = time.time() - time_fitets = end - start - - init_states_fitets = init_states.copy() - e_fitets = e.copy() - amse_fitets = amse.copy() - - # Reinitialise arrays - init_states.fill(0) - init_states[0] = y[0] - init_states[1] = (y[1]-y[0]) / 2 - e.fill(0) - amse.fill(0) - - # Nixtla/statsforcast implementation - start = time.time() - lik_etscalc = etscalc(ap, n, init_states, m, - error, trend, season, - alpha, beta, gamma, phi, - e, amse, nmse) - end = time.time() - time_etscalc = end - start - - init_states_etscalc = init_states.copy() - e_etscalc = e.copy() - amse_etscalc = amse.copy() - - # Comparing outputs and runtime - assert np.allclose(init_states_fitets, init_states_etscalc) - assert np.allclose(e_fitets, e_etscalc) - assert np.allclose(amse_fitets, amse_etscalc) - assert np.isclose(lik_fitets, lik_etscalc) - - print(time_fitets) - print(time_etscalc) - print(f1) - return - - -if __name__ == "__main__": - test_ets_comparison(setup()) diff --git a/tsml_eval/estimators/regression/__init__.py b/tsml_eval/estimators/regression/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tsml_eval/estimators/regression/_sklearn_regressor.py b/tsml_eval/estimators/regression/_sklearn_regressor.py new file mode 100644 index 00000000..aa86ed50 --- /dev/null +++ b/tsml_eval/estimators/regression/_sklearn_regressor.py @@ -0,0 +1,76 @@ +"""A tsml wrapper for sklearn regressors.""" + +__maintainer__ = ["MatthewMiddlehurst"] +__all__ = ["SklearnToTsmlRegressor"] + +import numpy as np +from aeon.base._base import _clone_estimator +from sklearn.base import RegressorMixin +from sklearn.utils.validation import check_is_fitted +from tsml.base import BaseTimeSeriesEstimator + + +class SklearnToTsmlRegressor(RegressorMixin, BaseTimeSeriesEstimator): + """Wrapper for sklearn estimators to use the tsml base class.""" + + def __init__( + self, + regressor=None, + pad_unequal=False, + concatenate_channels=False, + clone_estimator=True, + random_state=None, + ): + self.regressor = regressor + self.pad_unequal = pad_unequal + self.concatenate_channels = concatenate_channels + self.clone_estimator = clone_estimator + self.random_state = random_state + + super().__init__() + + def fit(self, X, y): + """Wrap fit.""" + if self.regressor is None: + raise ValueError("Regressor not set") + + X, y = self._validate_data( + X=X, + y=y, + convert_2d=self.concatenate_channels, + ensure_equal_length=not self.pad_unequal, + ) + X = self._convert_X( + X, + pad_unequal=self.pad_unequal, + concatenate_channels=self.concatenate_channels, + ) + + self._regressor = ( + _clone_estimator(self.regressor, self.random_state) + if self.clone_estimator + else self.regressor + ) + self._regressor.fit(X, y) + + return self + + def predict(self, X) -> np.ndarray: + """Wrap predict.""" + check_is_fitted(self) + + X = self._validate_data(X=X, reset=False) + X = self._convert_X( + X, + pad_unequal=self.pad_unequal, + concatenate_channels=self.concatenate_channels, + ) + + return self._regressor.predict(X) + + def 
_more_tags(self):
+        return {
+            "X_types": ["2darray"],
+            "equal_length_only": (False if self.pad_unequal else True),
+            "univariate_only": False if self.concatenate_channels else True,
+        }
diff --git a/tsml_eval/evaluation/storage/forecaster_results.py b/tsml_eval/evaluation/storage/forecaster_results.py
index 87cdf665..d372d730 100644
--- a/tsml_eval/evaluation/storage/forecaster_results.py
+++ b/tsml_eval/evaluation/storage/forecaster_results.py
@@ -1,10 +1,13 @@
 """Class for storing and loading results from a forecasting experiment."""

 import numpy as np
-from sklearn.metrics import mean_absolute_percentage_error
+from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

 from tsml_eval.evaluation.storage.estimator_results import EstimatorResults
-from tsml_eval.utils.results_writing import write_forecasting_results
+from tsml_eval.utils.results_writing import (
+    results_third_line,
+    write_results_to_tsml_format,
+)


 class ForecasterResults(EstimatorResults):
@@ -56,6 +59,8 @@ class ForecasterResults(EstimatorResults):
     ----------
     mean_absolute_percentage_error : float or None
         Mean absolute percentage error of the predictions.
+    mean_squared_error : float or None
+        Mean squared error of the predictions.

     Examples
     --------
@@ -100,6 +105,7 @@ def __init__(

         self.forecasting_horizon = None
         self.mean_absolute_percentage_error = None
+        self.mean_squared_error = None

         super().__init__(
             dataset_name=dataset_name,
@@ -118,6 +124,7 @@ def __init__(
     # var_name: (display_name, higher is better, is timing)
     statistics = {
         "mean_absolute_percentage_error": ("MAPE", False, False),
+        "mean_squared_error": ("MSE", False, False),
         **EstimatorResults.statistics,
     }
@@ -141,8 +148,19 @@ def save_to_file(self, file_path, full_path=True):
             self.mean_absolute_percentage_error = mean_absolute_percentage_error(
                 self.target_labels, self.predictions
             )
-
-        write_forecasting_results(
+        if self.mean_squared_error is None:
+            self.mean_squared_error = mean_squared_error(
+                self.target_labels, self.predictions
+            )
+        third_line = results_third_line(
+            y=self.target_labels,
+            preds=self.predictions,
+            fit_time=self.fit_time,
+            predict_time=self.predict_time,
+            benchmark_time=self.benchmark_time,
+            memory_usage=self.memory_usage,
+        )
+        write_results_to_tsml_format(
             self.predictions,
             self.target_labels,
             self.estimator_name,
@@ -150,15 +168,11 @@ def save_to_file(self, file_path, full_path=True):
             file_path,
             full_path=full_path,
             split=self.split,
-            random_seed=self.resample_id,
+            resample_id=self.resample_id,
             time_unit=self.time_unit,
             first_line_comment=self.description,
-            parameter_info=self.parameter_info,
-            mape=self.mean_absolute_percentage_error,
-            fit_time=self.fit_time,
-            predict_time=self.predict_time,
-            benchmark_time=self.benchmark_time,
-            memory_usage=self.memory_usage,
+            second_line=self.parameter_info,
+            third_line=third_line,
         )

     def load_from_file(self, file_path, verify_values=True):
@@ -205,6 +219,11 @@ def calculate_statistics(self, overwrite=False):
             self.target_labels, self.predictions
         )

+        if self.mean_squared_error is None or overwrite:
+            self.mean_squared_error = mean_squared_error(
+                self.target_labels, self.predictions
+            )
+
     def infer_size(self, overwrite=False):
         """
         Infer and return the size of the dataset used in the results.
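
As a point of reference for the refactor above, the two helpers can also be composed directly outside of the results classes. This is a minimal sketch, not part of the diff: the labels, predictions, timings, estimator name ("NaiveForecaster"), dataset name ("airline") and output directory are assumed placeholder values, and the keyword arguments mirror the ForecasterResults.save_to_file call above.

import numpy as np

from tsml_eval.utils.results_writing import (
    results_third_line,
    write_results_to_tsml_format,
)

labels = np.array([112.0, 118.0, 132.0, 129.0])
predictions = np.array([110.5, 119.0, 130.0, 131.5])

# Build the metrics/timings line, letting the helper compute MAPE and MSE
third_line = results_third_line(
    y=labels,
    preds=predictions,
    fit_time=120,
    predict_time=15,
)

# Write a TEST results file in the standard tsml directory structure
write_results_to_tsml_format(
    predictions,
    labels,
    "NaiveForecaster",
    "airline",
    "results/",
    full_path=False,
    split="TEST",
    resample_id=0,
    time_unit="MILLISECONDS",
    second_line="No Parameter Info",
    third_line=third_line,
)
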
diff --git a/tsml_eval/evaluation/storage/regressor_results.py b/tsml_eval/evaluation/storage/regressor_results.py
index e1b6ed48..a5aea40f 100644
--- a/tsml_eval/evaluation/storage/regressor_results.py
+++ b/tsml_eval/evaluation/storage/regressor_results.py
@@ -10,7 +10,10 @@
 )

 from tsml_eval.evaluation.storage.estimator_results import EstimatorResults
-from tsml_eval.utils.results_writing import write_regression_results
+from tsml_eval.utils.results_writing import (
+    regression_results_third_line,
+    write_results_to_tsml_format,
+)


 class RegressorResults(EstimatorResults):
@@ -180,8 +183,18 @@ def save_to_file(self, file_path, full_path=True):
             self.mean_squared_error = mean_squared_error(
                 self.target_labels, self.predictions
             )
-
-        write_regression_results(
+        third_line = regression_results_third_line(
+            y=self.target_labels,
+            preds=self.predictions,
+            fit_time=self.fit_time,
+            predict_time=self.predict_time,
+            benchmark_time=self.benchmark_time,
+            memory_usage=self.memory_usage,
+            train_estimate_method=self.train_estimate_method,
+            train_estimate_time=self.train_estimate_time,
+            fit_and_estimate_time=self.fit_and_estimate_time,
+        )
+        write_results_to_tsml_format(
             self.predictions,
             self.target_labels,
             self.estimator_name,
@@ -192,15 +205,8 @@ def save_to_file(self, file_path, full_path=True):
             resample_id=self.resample_id,
             time_unit=self.time_unit,
             first_line_comment=self.description,
-            parameter_info=self.parameter_info,
-            mse=self.mean_squared_error,
-            fit_time=self.fit_time,
-            predict_time=self.predict_time,
-            benchmark_time=self.benchmark_time,
-            memory_usage=self.memory_usage,
-            train_estimate_method=self.train_estimate_method,
-            train_estimate_time=self.train_estimate_time,
-            fit_and_estimate_time=self.fit_and_estimate_time,
+            second_line=self.parameter_info,
+            third_line=third_line,
         )

     def load_from_file(self, file_path, verify_values=True):
diff --git a/tsml_eval/experiments/_get_forecaster.py b/tsml_eval/experiments/_get_forecaster.py
index 4f68ec75..8e0c2fe9 100644
--- a/tsml_eval/experiments/_get_forecaster.py
+++ b/tsml_eval/experiments/_get_forecaster.py
@@ -2,6 +2,16 @@

 __maintainer__ = ["MatthewMiddlehurst"]

+from aeon.forecasting import (
+    AutoARIMAForecaster,
+    AutoETSForecaster,
+    AutoSARIMAForecaster,
+    ETSForecaster,
+    NaiveForecaster,
+)
+from aeon.forecasting._sktime_autoets import SktimeAutoETSForecaster
+from aeon.forecasting._statsforecast_autoets import StatsForecastAutoETSForecaster
+
 from tsml_eval.utils.functions import str_in_nested_list

 deep_forecasters = [
@@ -16,7 +26,11 @@
     "autoarima",
     ["etsforecaster", "ets"],
     ["tarforecaster", "tar"],
+    ["autoetsforecaster", "autoets"],
+    ["autosarima", "sarima"],
     "autotar",
+    "sktimeets",
+    "statsforecastets",
     ["setarforecaster", "setar"],
     ["thetaforecaster", "theta"],
     ["tvpforecaster", "tvp"],
@@ -123,6 +137,16 @@ def _set_forecaster_stats(f, random_state, n_jobs, kwargs):
         from aeon.forecasting.stats import TVP

         return TVP(**kwargs)
+    elif f == "autoetsforecaster" or f == "autoets":
+        return AutoETSForecaster(**kwargs)
+    elif f == "sktimeets":
+        return SktimeAutoETSForecaster(**kwargs)
+    elif f == "statsforecastets":
+        return StatsForecastAutoETSForecaster(**kwargs)
+    elif f == "autosarima" or f == "sarima":
+        return AutoSARIMAForecaster(**kwargs)
+    elif f == "autoarima":
+        return AutoARIMAForecaster(**kwargs)


 def _set_forecaster_regression(f, random_state, n_jobs, kwargs):
diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py
index e78ca8df..75a8d7df 100644
--- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -16,6 +16,7 @@ ] import os +import tempfile import time import warnings from datetime import datetime @@ -25,8 +26,9 @@ from aeon.benchmarking.metrics.clustering import clustering_accuracy_score from aeon.classification import BaseClassifier from aeon.clustering import BaseClusterer -from aeon.forecasting import BaseForecaster +from aeon.forecasting import BaseForecaster, RegressionForecaster from aeon.regression.base import BaseRegressor +from aeon.transformations.series import TrainTestTransformer from aeon.utils.validation import get_n_cases from sklearn import preprocessing from sklearn.base import BaseEstimator, is_classifier, is_regressor @@ -50,13 +52,15 @@ estimator_attributes_to_file, timing_benchmark, ) +from tsml_eval.utils.functions import time_function from tsml_eval.utils.memory_recorder import record_max_memory from tsml_eval.utils.resampling import resample_data, stratified_resample_data from tsml_eval.utils.results_writing import ( + regression_results_third_line, + results_third_line, write_classification_results, write_clustering_results, - write_forecasting_results, - write_regression_results, + write_results_to_tsml_format, ) MEMRECORD_ENV = os.getenv("MEMRECORD_INTERVAL") @@ -440,28 +444,37 @@ def load_and_run_classification_experiment( ) -def run_regression_experiment( - X_train: np.ndarray | list, - y_train: np.ndarray, - X_test: np.ndarray | list, - y_test: np.ndarray, - regressor, - results_path, - regressor_name=None, - dataset_name="", - resample_id=None, - data_transforms=None, - build_test_file=True, - build_train_file=False, - ignore_custom_train_estimate=False, - attribute_file_path=None, - att_max_shape=0, - benchmark_time=True, +def transform_input( + data_transforms, + x_train: np.ndarray, + x_test: np.ndarray, + y_train: np.ndarray = None, + y_test: np.ndarray = None, ): - """Run a regression experiment and save the results to file. + if data_transforms is not None: + if not isinstance(data_transforms, list): + data_transforms = [data_transforms] + + for transform in data_transforms: + x_train = transform.fit_transform(x_train, y_train) + x_test = transform.transform(x_test, y_test) + return x_train, x_test + + +def cross_validate_train_data(estimator, y_train, X_train): + cv_size = min(10, len(y_train)) + start = int(round(time.time() * 1000)) + train_preds = cross_val_predict(estimator, X_train, y=y_train, cv=cv_size) + train_time = int(round(time.time() * 1000)) - start + train_estimate_method = f"{cv_size}F-CV" + return train_preds, train_time, train_estimate_method - Function to run a basic regression experiment for a - // combination and write the results to csv file(s) + +class Experiment: + """Run an experiment and save the results to file. + + Function to run a basic experiment for a + // combination and write the results to csv file(s) at a given location. Parameters @@ -478,13 +491,13 @@ def run_regression_experiment( y_test : np.array Testing data class labels. One label per case in the testing data using the same ordering. - regressor : BaseRegressor - Regressor to be used in the experiment. + estimator : BaseRegressor + Estimator to be used in the experiment. results_path : str Location of where to write results. Any required directories will be created. - regressor_name : str or None, default=None - Name of regressor used in writing results. If None, the name is taken from - the regressor. 
+    estimator_name : str or None, default=None
+        Name of estimator used in writing results. If None, the name is taken from
+        the estimator.
     dataset_name : str, default="N/A"
         Name of dataset.
     resample_id : int or None, default=None
@@ -496,11 +509,11 @@
         If None, no transformation is applied. Calls fit_transform on the training
         data and transform on the test data.
     build_test_file : bool, default=True:
-        Whether to generate test files or not. If the regressor can generate its own
+        Whether to generate test files or not. If the estimator can generate its own
         train predictions, the classifier will be built but no file will be output.
     build_train_file : bool, default=False
         Whether to generate train files or not. If true, it performs a 10-fold
-        cross-validation on the train data and saves. If the regressor can produce its
+        cross-validation on the train data and saves. If the estimator can produce its
         own estimates, those are used instead.
     ignore_custom_train_estimate : bool, default=False
         todo
@@ -512,150 +525,298 @@
         Whether to benchmark the hardware used with a simple function and write the
         results. This will typically take ~2 seconds, but is hardware dependent.
     """
-    if not build_test_file and not build_train_file:
-        raise ValueError(
-            "Both test_file and train_file are set to False. "
-            "At least one must be written."
+
+    def __init__(
+        self,
+        estimator,
+        results_path,
+        estimator_name=None,
+        dataset_name="",
+        resample_id=None,
+        data_transforms=None,
+        overwrite=False,
+        build_train_file=False,
+        write_attributes=False,
+        att_max_shape=0,
+        benchmark_time=True,
+    ):
+        # Resolve the estimator name before it is used to locate existing results
+        if estimator_name is None:
+            estimator_name = type(estimator).__name__
+
+        build_test_file, build_train_file = _check_existing_results(
+            results_path,
+            estimator_name,
+            dataset_name,
+            resample_id,
+            overwrite,
+            True,
+            build_train_file,
        )

-    if regressor_name is None:
-        regressor_name = type(regressor).__name__
+        if not build_test_file and not build_train_file:
+            warnings.warn(
+                "All files exist and not overwriting, skipping.", stacklevel=1
+            )
+            return None

-    use_fit_predict = False
-    if isinstance(regressor, BaseRegressor):
-        if not ignore_custom_train_estimate and regressor.get_tag(
-            "capability:train_estimate", False, False
-        ):
-            use_fit_predict = True
-    elif isinstance(regressor, BaseTimeSeriesEstimator) and is_regressor(regressor):
-        pass
-    elif isinstance(regressor, BaseEstimator) and is_regressor(regressor):
-        regressor = SklearnToTsmlRegressor(
-            regressor=regressor,
-            pad_unequal=True,
-            concatenate_channels=True,
-            clone_estimator=False,
-            random_state=(
-                regressor.random_state if hasattr(regressor, "random_state") else None
-            ),
+        if write_attributes:
+            attribute_file_path = (
+                f"{results_path}/{estimator_name}/Workspace/{dataset_name}/"
+            )
+        else:
+            attribute_file_path = None
+
+        self.build_train_file = build_train_file
+        self.build_test_file = build_test_file
+        self.data_transforms = data_transforms
+        self.benchmark_time = benchmark_time
+        self.results_path = results_path
+        self.dataset_name = dataset_name
+        self.resample_id = resample_id
+        self.benchmark = -1
+        self.estimator_name = estimator_name
+        self.estimator = self.validate_estimator(estimator=estimator)
+        self.second_comment = (
+            str(estimator.get_params()).replace("\n", " ").replace("\r", " ")
        )
-    else:
-        raise TypeError("regressor must be a tsml, aeon or sklearn regressor.")
+        if attribute_file_path is not None:
+            estimator_attributes_to_file(
+                self.estimator, attribute_file_path, max_list_shape=att_max_shape
+            )

-    if data_transforms is not None:
-        if not isinstance(data_transforms, list):
-            data_transforms = [data_transforms]
+    def run_experiment(self):
+        x_train, y_train, x_test, y_test = self.load_experimental_data()

-        for transform in data_transforms:
-            X_train = transform.fit_transform(X_train, y_train)
-            X_test = transform.transform(X_test, y_test)
+        # Ensure labels are floats where the task provides them
+        if y_train is not None:
+            y_train = y_train.astype(float)
+        if y_test is not None:
+            y_test = y_test.astype(float)
+
+        self.first_comment = (
+            "Generated by run_experiment on "
+            f"{datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}"
+        )

-    needs_fit = True
-    fit_time = -1
-    mem_usage = -1
-    benchmark = -1
-    train_time = -1
-    fit_and_train_time = -1
+        x_train, x_test = transform_input(
+            data_transforms=self.data_transforms,
+            x_train=x_train,
+            x_test=x_test,
+            y_train=y_train,
+            y_test=y_test,
+        )
+        if self.benchmark_time:
+            self.benchmark = timing_benchmark(random_state=self.resample_id)

-    if benchmark_time:
-        benchmark = timing_benchmark(random_state=resample_id)
+        if self.build_train_file:
+            train_preds, train_time = self.generate_train_preds(x_train, y_train)
+            self.write_results(
+                "TRAIN", y_train, train_preds, train_time, -1, self.benchmark, -1
+            )

-    first_comment = (
-        "Generated by run_regression_experiment on "
-        f"{datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}"
-    )
+        if self.build_test_file:
+            fit_time = -1
+            mem_usage = -1
+            if self.needs_fit():
+                mem_usage, fit_time = record_max_memory(
+                    self.estimator.fit,
+                    args=(x_train, y_train),
+                    interval=MEMRECORD_INTERVAL,
+                    return_func_time=True,
+                )
+                fit_time += int(round(getattr(self.estimator, "_fit_time_milli", 0)))
+            test_preds, test_time = self.generate_test_preds(x_test, y_test)
+            test_time += int(round(getattr(self.estimator, "_predict_time_milli", 0)))
+            self.write_results(
+                "TEST",
+                y_test,
+                test_preds,
+                fit_time,
+                test_time,
+                self.benchmark,
+                mem_usage,
+            )

-    second = str(regressor.get_params()).replace("\n", " ").replace("\r", " ")
+    def load_experimental_data(self):
+        return None, None, None, None

-    if build_train_file:
-        cv_size = min(10, len(y_train))
-        start = int(round(time.time() * 1000))
-        if use_fit_predict:
-            train_preds = regressor.fit_predict(X_train, y_train)
-            needs_fit = False
-            fit_and_train_time = int(round(time.time() * 1000)) - start
-        else:
-            train_preds = cross_val_predict(regressor, X_train, y=y_train, cv=cv_size)
-            train_time = int(round(time.time() * 1000)) - start
+    def validate_estimator(self, estimator):
+        # Base behaviour: accept the estimator unchanged
+        return estimator

-        train_mse = mean_squared_error(y_train, train_preds)
+    def generate_train_preds(self, X_train, y_train):
+        return time_function(self.estimator.fit_predict, (X_train, y_train))

-        write_regression_results(
-            train_preds,
-            y_train,
-            regressor_name,
-            dataset_name,
-            results_path,
+    def generate_test_preds(self, x_test, y_test):
+        return time_function(self.estimator.predict, x_test)
+
+    def needs_fit(self):
+        return False
+
+    def write_results(
+        self, split, y, preds, fit_time, predict_time, benchmark_time, memory_usage
+    ):
+        third_line = self.get_third_line(
+            y, preds, fit_time, predict_time, benchmark_time, memory_usage
+        )
+        write_results_to_tsml_format(
+            preds,
+            y,
+            self.estimator_name,
+            self.dataset_name,
+            self.results_path,
             full_path=False,
-            first_line_regressor_name=f"{regressor_name} ({type(regressor).__name__})",
-            split="TRAIN",
-            resample_id=resample_id,
+            first_line_estimator_name=f"{self.estimator_name} ({type(self.estimator).__name__})",
+            split=split,
+            resample_id=self.resample_id,
             time_unit="MILLISECONDS",
- first_line_comment=first_comment, - parameter_info=second, - mse=train_mse, + first_line_comment=self.first_comment, + second_line=self.second_comment, + third_line=third_line, + ) + + def get_third_line( + self, y, preds, fit_time, predict_time, benchmark_time, memory_usage + ): + return results_third_line( + y=y, + preds=preds, fit_time=fit_time, - predict_time=-1, - benchmark_time=benchmark, - memory_usage=mem_usage, - train_estimate_method="Custom" if use_fit_predict else f"{cv_size}F-CV", - train_estimate_time=train_time, - fit_and_estimate_time=fit_and_train_time, + predict_time=predict_time, + benchmark_time=benchmark_time, + memory_usage=memory_usage, ) - if build_test_file: - if needs_fit: - mem_usage, fit_time = record_max_memory( - regressor.fit, - args=(X_train, y_train), - interval=MEMRECORD_INTERVAL, - return_func_time=True, + +class ForecastingExperiment(Experiment): + def __init__(self): + pass + + def load_experimental_data(self): + train = pd.read_csv( + f"{self.problem_path}/{self.dataset_name}/{self.dataset_name}_TRAIN.csv", + index_col=0, + ).squeeze("columns") + train = train.astype(float).to_numpy() + test = pd.read_csv( + f"{self.problem_path}/{self.dataset_name}/{self.dataset_name}_TEST.csv", + index_col=0, + ).squeeze("columns") + test = test.astype(float).to_numpy() + return train, None, test, None + + def generate_test_preds(self, x_test, y_test): + # TODO Implement this and train_preds properly + # Remove last value as we have no actual data for it + test_preds, test_time = time_function(self.estimator.predict, x_test) + test_preds = test_preds.flatten()[:-1] + return test_preds, test_time + + def validate_estimator(self, estimator): + return validate_forecaster(estimator) + + +class RegressionExperiment(Experiment): + def __init__( + self, + ignore_custom_train_estimate=False, + predefined_resample=False, + problem_path="", + ): + self.is_fitted = False + self.ignore_custom_train_estimate = ignore_custom_train_estimate + self.problem_path = problem_path + self.predefined_resample = predefined_resample + + def load_experimental_data(self): + X_train, y_train, X_test, y_test, resample = load_experiment_data( + self.problem_path, + self.dataset_name, + self.resample_id, + self.predefined_resample, + ) + + if resample: + X_train, y_train, X_test, y_test = resample_data( + X_train, y_train, X_test, y_test, random_state=self.resample_id ) - fit_time += int(round(getattr(regressor, "_fit_time_milli", 0))) + return X_train, y_train, X_test, y_test - if attribute_file_path is not None: - estimator_attributes_to_file( - regressor, attribute_file_path, max_list_shape=att_max_shape + def generate_train_preds(self, X_train, y_train): + if self.estimate_train_data and not self.ignore_custom_train_estimate: + self.train_estimate_method = "Custom" + train_preds, train_time = time_function( + self.estimator.fit_predict, (X_train, y_train) + ) + self.is_fitted = True + else: + train_preds, train_time, self.train_estimate_method = ( + cross_validate_train_data(self.estimator, y_train, X_train) ) + return train_preds, train_time - start = int(round(time.time() * 1000)) - test_preds = regressor.predict(X_test) - test_time = (int(round(time.time() * 1000)) - start) + int( - round(getattr(regressor, "_predict_time_milli", 0)) + def needs_fit(self): + return not self.is_fitted + + def get_third_line( + self, y, preds, fit_time, predict_time, benchmark_time, memory_usage + ): + return regression_results_third_line( + y=y, + preds=preds, + fit_time=fit_time, + 
predict_time=predict_time, + benchmark_time=benchmark_time, + memory_usage=memory_usage, + train_estimate_method=self.train_estimate_method, ) - test_mse = mean_squared_error(y_test, test_preds) + def validate_estimator(self, estimator): + estimator, estimate_train_data = validate_regressor(estimator) + self.estimate_train_data = estimate_train_data + return estimator - write_regression_results( - test_preds, - y_test, - regressor_name, - dataset_name, - results_path, - full_path=False, - first_line_regressor_name=f"{regressor_name} ({type(regressor).__name__})", - split="TEST", - resample_id=resample_id, - time_unit="MILLISECONDS", - first_line_comment=first_comment, - parameter_info=second, - mse=test_mse, - fit_time=fit_time, - predict_time=test_time, - benchmark_time=benchmark, - memory_usage=mem_usage, - train_estimate_method="N/A", - train_estimate_time=-1, - fit_and_estimate_time=fit_and_train_time, + +def validate_forecaster(estimator): + if isinstance(estimator, BaseForecaster): + return estimator + else: + try: + estimator, _ = validate_regressor(estimator) + return RegressionForecaster(regressor=estimator) + except TypeError: + raise TypeError( + "forecaster must be an aeon forecaster or a tsml, aeon or sklearn regressor." + ) + + +def validate_regressor(estimator): + estimate_train_data = False + if isinstance(estimator, BaseRegressor): + if estimator.get_tag("capability:train_estimate", False, False): + estimate_train_data = True + return estimator, estimate_train_data + elif isinstance(estimator, BaseTimeSeriesEstimator) and is_regressor(estimator): + return estimator, estimate_train_data + elif isinstance(estimator, BaseEstimator) and is_regressor(estimator): + return ( + SklearnToTsmlRegressor( + regressor=estimator, + pad_unequal=True, + concatenate_channels=True, + clone_estimator=False, + random_state=( + estimator.random_state + if hasattr(estimator, "random_state") + else None + ), + ), + estimate_train_data, ) + else: + raise TypeError("regressor must be a tsml, aeon or sklearn regressor.") def load_and_run_regression_experiment( problem_path, results_path, dataset, - regressor, - regressor_name=None, + estimator, + estimator_name=None, resample_id=0, data_transforms=None, build_train_file=False, @@ -665,10 +826,10 @@ def load_and_run_regression_experiment( overwrite=False, predefined_resample=False, ): - """Load a dataset and run a regression experiment. + """Load a dataset and run an experiment. - Function to load a dataset, run a basic regression experiment for a - // combination, and write the results to csv file(s) + Function to load a dataset, run a basic experiment for a + // combination, and write the results to csv file(s) at a given location. Parameters @@ -680,11 +841,11 @@ def load_and_run_regression_experiment( dataset : str Name of problem. Files must be //+"_TRAIN.ts", same for "_TEST.ts". - regressor : BaseRegressor - Regressor to be used in the experiment. - regressor_name : str or None, default=None - Name of regressor used in writing results. If None, the name is taken from - the regressor. + estimator : BaseRegressor + Estimator to be used in the experiment. + estimator_name : str or None, default=None + Name of estimator used in writing results. If None, the name is taken from + the estimator. resample_id : int, default=0 Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. 
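
The validate_regressor path above can also be exercised on its own: a plain sklearn estimator is wrapped in SklearnToTsmlRegressor so that 3D time series input can be flattened and forwarded. The following is a minimal sketch under assumed toy data shapes; the wrapper and its parameters come from the new _sklearn_regressor.py file in this diff, and the import path is that module's location.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

from tsml_eval.estimators.regression._sklearn_regressor import SklearnToTsmlRegressor

rng = np.random.default_rng(0)
X = rng.random((20, 1, 50))  # 20 univariate series of length 50
y = rng.random(20)

# Wrap the sklearn regressor the same way validate_regressor does
wrapped = SklearnToTsmlRegressor(
    regressor=RandomForestRegressor(n_estimators=10, random_state=0),
    pad_unequal=True,
    concatenate_channels=True,
    clone_estimator=False,
    random_state=0,
)
wrapped.fit(X, y)
preds = wrapped.predict(X)
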
@@ -695,7 +856,7 @@ def load_and_run_regression_experiment( Calls fit_transform on the training data and transform on the test data. build_train_file : bool, default=False Whether to generate train files or not. If true, it performs a 10-fold - cross-validation on the train data and saves. If the regressor can produce its + cross-validation on the train data and saves. If the estimator can produce its own estimates, those are used instead. benchmark_time : bool, default=True Whether to benchmark the hardware used with a simple function and write the @@ -708,58 +869,7 @@ def load_and_run_regression_experiment( the file format must include the resample_id at the end of the dataset name i.e. //++"_TRAIN.ts". """ - if regressor_name is None: - regressor_name = type(regressor).__name__ - - build_test_file, build_train_file = _check_existing_results( - results_path, - regressor_name, - dataset, - resample_id, - overwrite, - True, - build_train_file, - ) - - if not build_test_file and not build_train_file: - warnings.warn("All files exist and not overwriting, skipping.", stacklevel=1) - return - - X_train, y_train, X_test, y_test, resample = load_experiment_data( - problem_path, dataset, resample_id, predefined_resample - ) - - if resample: - X_train, y_train, X_test, y_test = resample_data( - X_train, y_train, X_test, y_test, random_state=resample_id - ) - - if write_attributes: - attribute_file_path = f"{results_path}/{regressor_name}/Workspace/{dataset}/" - else: - attribute_file_path = None - - # Ensure labels are floats - y_train = y_train.astype(float) - y_test = y_test.astype(float) - - run_regression_experiment( - X_train, - y_train, - X_test, - y_test, - regressor, - results_path, - regressor_name=regressor_name, - dataset_name=dataset, - resample_id=resample_id, - data_transforms=data_transforms, - build_test_file=build_test_file, - build_train_file=build_train_file, - attribute_file_path=attribute_file_path, - att_max_shape=att_max_shape, - benchmark_time=benchmark_time, - ) + pass def run_clustering_experiment( @@ -1150,122 +1260,71 @@ def load_and_run_clustering_experiment( def run_forecasting_experiment( train, - test, - forecaster, + y_test, + estimator, results_path, - forecaster_name=None, - dataset_name="N/A", - random_seed=None, + estimator_name=None, + dataset_name="", + resample_id=None, + data_transforms=None, + build_test_file=True, + build_train_file=False, attribute_file_path=None, att_max_shape=0, benchmark_time=True, ): - """Run a forecasting experiment and save the results to file. + """Run an experiment and save the results to file. - Function to run a basic forecasting experiment for a - // combination and write the results to csv file(s) + Function to run a basic experiment for a + // combination and write the results to csv file(s) at a given location. Parameters ---------- train : pd.DataFrame or np.array - The series used to train the forecaster. - test : pd.DataFrame or np.array - The series used to test the trained forecaster. - forecaster : BaseForecaster - Regressor to be used in the experiment. + The series used to train the estimator. + y_test : pd.DataFrame or np.array + The series used to y_test the trained estimator. + estimator : BaseForecaster + Estimator to be used in the experiment. results_path : str Location of where to write results. Any required directories will be created. - forecaster_name : str or None, default=None - Name of forecaster used in writing results. If None, the name is taken from - the forecaster. 
+ estimator_name : str or None, default=None + Name of estimator used in writing results. If None, the name is taken from + the estimator. dataset_name : str, default="N/A" Name of dataset. - random_seed : int or None, default=None - Indicates what random seed was used as a random_state for the forecaster. Only + resample_id : int or None, default=None + Indicates what random seed was used as a random_state for the estimator. Only used for the results file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. benchmark_time : bool, default=True Whether to benchmark the hardware used with a simple function and write the results. This will typically take ~2 seconds, but is hardware dependent. """ - if not isinstance(forecaster, BaseForecaster): - raise TypeError("forecaster must be an aeon forecaster.") - - if forecaster_name is None: - forecaster_name = type(forecaster).__name__ - - benchmark = -1 - if benchmark_time: - benchmark = timing_benchmark(random_state=random_seed) - - first_comment = ( - "Generated by run_forecasting_experiment on " - f"{datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}" - ) - - second = str(forecaster.get_params()).replace("\n", " ").replace("\r", " ") - - mem_usage, fit_time = record_max_memory( - forecaster.fit, - args=(train,), - interval=MEMRECORD_INTERVAL, - return_func_time=True, - ) - fit_time += int(round(getattr(forecaster, "_fit_time_milli", 0))) - - if attribute_file_path is not None: - estimator_attributes_to_file( - forecaster, attribute_file_path, max_list_shape=att_max_shape - ) - - start = int(round(time.time() * 1000)) - test_preds = forecaster.predict(np.arange(1, len(test) + 1)) - test_time = ( - int(round(time.time() * 1000)) - - start - + int(round(getattr(forecaster, "_predict_time_milli", 0))) - ) - test_preds = test_preds.flatten() - - test_mape = mean_absolute_percentage_error(test, test_preds) - - write_forecasting_results( - test_preds, - test, - forecaster_name, - dataset_name, - results_path, - full_path=False, - first_line_forecaster_name=f"{forecaster_name} ({type(forecaster).__name__})", - split="TEST", - random_seed=random_seed, - time_unit="MILLISECONDS", - first_line_comment=first_comment, - parameter_info=second, - mape=test_mape, - fit_time=fit_time, - predict_time=test_time, - benchmark_time=benchmark, - memory_usage=mem_usage, - ) + pass def load_and_run_forecasting_experiment( problem_path, results_path, dataset, - forecaster, - forecaster_name=None, - random_seed=None, + estimator, + estimator_name=None, + resample_id=None, write_attributes=False, att_max_shape=0, benchmark_time=True, overwrite=False, ): - """Load a dataset and run a regression experiment. + """Load a dataset and run an experiment. - Function to load a dataset, run a basic regression experiment for a - / combination, and write the results to csv file(s) + Function to load a dataset, run a basic experiment for a + / combination, and write the results to csv file(s) at a given location. Parameters @@ -1277,13 +1336,13 @@ def load_and_run_forecasting_experiment( dataset : str Name of problem. Files must be //+"_TRAIN.csv", same for "_TEST.csv". - forecaster : BaseForecaster - Regressor to be used in the experiment. 
- forecaster_name : str or None, default=None - Name of forecaster used in writing results. If None, the name is taken from - the forecaster. - random_seed : int or None, default=None - Indicates what random seed was used as a random_state for the forecaster. Only + estimator : BaseForecaster + Estimator to be used in the experiment. + estimator_name : str or None, default=None + Name of estimator used in writing results. If None, the name is taken from + the estimator. + resample_id : int or None, default=None + Indicates what random seed was used as a random_state for the estimator. Only used for the results file name. benchmark_time : bool, default=True Whether to benchmark the hardware used with a simple function and write the @@ -1292,45 +1351,28 @@ def load_and_run_forecasting_experiment( If set to False, this will only build results if there is not a result file already present. If True, it will overwrite anything already there. """ - if forecaster_name is None: - forecaster_name = type(forecaster).__name__ - - build_test_file, _ = _check_existing_results( - results_path, - forecaster_name, - dataset, - random_seed, - overwrite, - True, - False, + tmpdir = tempfile.mkdtemp() + dataset = load_forecasting(dataset, tmpdir) + series = ( + dataset[dataset["series_name"] == series_name]["series_value"] + .iloc[0] + .to_numpy() ) + from aeon.transformations.series import TrainTestTransformer - if not build_test_file: - warnings.warn("All files exist and not overwriting, skipping.", stacklevel=1) - return - - if write_attributes: - attribute_file_path = f"{results_path}/{forecaster_name}/Workspace/{dataset}/" - else: - attribute_file_path = None - - train = pd.read_csv( - f"{problem_path}/{dataset}/{dataset}_TRAIN.csv", index_col=0 - ).squeeze("columns") + dataset = f"{dataset}_{series_name}" + train, test = TrainTestTransformer().fit_transform(series) train = train.astype(float).to_numpy() - test = pd.read_csv( - f"{problem_path}/{dataset}/{dataset}_TEST.csv", index_col=0 - ).squeeze("columns") test = test.astype(float).to_numpy() run_forecasting_experiment( train, test, - forecaster, + estimator, results_path, - forecaster_name=forecaster_name, + estimator_name=estimator_name, dataset_name=dataset, - random_seed=random_seed, + resample_id=resample_id, attribute_file_path=attribute_file_path, att_max_shape=att_max_shape, benchmark_time=benchmark_time, diff --git a/tsml_eval/experiments/forecasting_experiments.py b/tsml_eval/experiments/forecasting_experiments.py index d7b83d85..2bcffe64 100644 --- a/tsml_eval/experiments/forecasting_experiments.py +++ b/tsml_eval/experiments/forecasting_experiments.py @@ -20,6 +20,9 @@ os.environ["TF_NUM_INTEROP_THREADS"] = "1" os.environ["TF_NUM_INTRAOP_THREADS"] = "1" +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/aeon") +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/tsml-eval") + import numba from aeon.utils.validation._dependencies import _check_soft_dependencies diff --git a/tsml_eval/experiments/regression_experiments.py b/tsml_eval/experiments/regression_experiments.py index 6f681bdf..5880fd1b 100644 --- a/tsml_eval/experiments/regression_experiments.py +++ b/tsml_eval/experiments/regression_experiments.py @@ -20,6 +20,9 @@ os.environ["TF_NUM_INTEROP_THREADS"] = "1" os.environ["TF_NUM_INTRAOP_THREADS"] = "1" +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/aeon") +sys.path.append("C:/Users/alexb/Documents/University/PhD/aeon/tsml-eval") + import numba from aeon.utils.validation._dependencies import 
_check_soft_dependencies
@@ -73,6 +76,32 @@ def run_experiment(args):
         ):
             print("Ignoring, results already present")
         else:
+            if current_dataset != item[0]:
+                dataset = load_forecasting(item[0], tmpdir)
+                current_dataset = item[0]
+                print(f"Current Dataset: {current_dataset}")  # noqa
+            f.write(f"{item[0]}_{item[1]}\n")
+            series = (
+                dataset[dataset["series_name"] == item[1]]["series_value"]
+                .iloc[0]
+                .to_numpy()
+            )
+            dataset_name = f"{item[0]}_{item[1]}"
+            full_file_path = f"{location_of_datasets}/{dataset_name}"
+            if not os.path.exists(full_file_path):
+                os.makedirs(full_file_path)
+            if problem_type == "regression":
+                write_regression_dataset(
+                    series,
+                    full_file_path,
+                    dataset_name,
+                    difference_series=False,
+                    difference_y=True,
+                )
+            elif problem_type == "forecasting":
+                write_forecasting_dataset(
+                    series, full_file_path, dataset_name, difference_series=False
+                )
             load_and_run_regression_experiment(
                 args.data_path,
                 args.results_path,
diff --git a/tsml_eval/utils/functions.py b/tsml_eval/utils/functions.py
index 4c08a0ed..f325b5d2 100644
--- a/tsml_eval/utils/functions.py
+++ b/tsml_eval/utils/functions.py
@@ -7,6 +7,8 @@
     "rank_array",
 ]

+import time
+
 import numpy as np

@@ -100,3 +102,17 @@ def rank_array(arr, higher_better=True):
         ranks = len(arr) + 1 - ranks

     return ranks
+
+
+def time_function(function, args=None, kwargs=None):
+    """Time a call in milliseconds, returning (output, elapsed time)."""
+    if args is None:
+        args = ()
+    elif not isinstance(args, tuple):
+        # Allow a single positional argument to be passed without wrapping
+        args = (args,)
+    if kwargs is None:
+        kwargs = {}
+    start = int(round(time.time() * 1000))
+    output = function(*args, **kwargs)
+    return output, int(round(time.time() * 1000)) - start
diff --git a/tsml_eval/utils/results_writing.py b/tsml_eval/utils/results_writing.py
index 7546f623..8c081c70 100644
--- a/tsml_eval/utils/results_writing.py
+++ b/tsml_eval/utils/results_writing.py
@@ -4,15 +4,14 @@

 __all__ = [
     "write_classification_results",
-    "write_regression_results",
     "write_clustering_results",
-    "write_forecasting_results",
     "write_results_to_tsml_format",
 ]

 import os

 import numpy as np
+from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error


 def write_classification_results(
@@ -148,20 +147,51 @@ def write_classification_results(
     )


-def write_regression_results(
-    predictions,
-    labels,
-    regressor_name,
-    dataset_name,
-    file_path,
-    full_path=True,
-    first_line_regressor_name=None,
-    split=None,
-    resample_id=None,
-    time_unit="N/A",
-    first_line_comment=None,
-    parameter_info="No Parameter Info",
-    mse=-1,
+def results_third_line(
+    y=None,
+    preds=None,
+    fit_time=-1,
+    predict_time=-1,
+    benchmark_time=-1,
+    memory_usage=-1,
+):
+    """Create the third line for forecasting results files.
+
+    Parameters
+    ----------
+    y: np.array, default=None
+        The true data
+    preds: np.array, default=None
+        The predicted data
+    fit_time : int, default=-1
+        The time taken to fit the estimator.
+    predict_time : int, default=-1
+        The time taken to predict the labels.
+    benchmark_time : int, default=-1
+        A benchmark time for the hardware used to scale other timings.
+    memory_usage : int, default=-1
+        The memory usage of the estimator.
+    """
+    mape = -1
+    mse = -1
+    if y is not None and preds is not None:
+        # The mean absolute percentage error of the predictions.
+        mape = mean_absolute_percentage_error(y, preds)
+        # The mean squared error of the predictions.
+ mse = mean_squared_error(y, preds) + return ( + f"{mape}," + f"{mse}," + f"{fit_time}," + f"{predict_time}," + f"{benchmark_time}," + f"{memory_usage}" + ) + + +def regression_results_third_line( + y=None, + preds=None, fit_time=-1, predict_time=-1, benchmark_time=-1, @@ -170,61 +200,28 @@ def write_regression_results( train_estimate_time=-1, fit_and_estimate_time=-1, ): - """Write the predictions for a regression experiment in the format used by tsml. + """Create the third line for regression results files. Parameters ---------- - predictions : np.array - The predicted values to write to file. Must be the same length as labels. - labels : np.array - The actual label values written to file with the predicted values. - regressor_name : str - Name of the regressor that made the predictions. Written to file and can - determine file structure if full_path is False. - dataset_name : str - Name of the problem the regressor was built on. - file_path : str - Path to write the results file to or the directory to build the default file - structure if full_path is False. - full_path : boolean, default=True - If True, results are written directly to the directory passed in file_path. - If False, then a standard file structure using the regressor and dataset names - is created and used to write the results file. - first_line_regressor_name : str or None, default=None - Alternative name for the regressor to be written to the file. If None, the - regressor_name is used. Useful if full_path is False and extra information is - wanted in the regressor name (i.e. and alias and class name) - split : str or None, default=None - Either None, 'TRAIN' or 'TEST'. Influences the result file name and first line - of the file. - resample_id : int or None, default=None - Indicates what random seed was used to resample the data or used as a - random_state for the regressor. - time_unit : str, default="N/A" - The format used for timings in the file, i.e. 'Seconds', 'Milliseconds', - 'Nanoseconds' - first_line_comment : str or None, default=None - Optional comment appended to the end of the first line, i.e. the file used to - generate the results. - parameter_info : str, default="No Parameter Info" - Unstructured estimator dependent information, i.e. estimator parameters or - values from the model build. - mse: float, default=-1 - The mean squared error of the predictions. + y: np.array, default=None + The true data + preds: np.array, default=None + The predicted data fit_time : int, default=-1 - The time taken to fit the regressor. + The time taken to fit the estimator. predict_time : int, default=-1 - The time taken to predict the regression labels. + The time taken to predict the labels. benchmark_time : int, default=-1 A benchmark time for the hardware used to scale other timings. memory_usage : int, default=-1 - The memory usage of the regressor. + The memory usage of the estimator. train_estimate_method : str, default="" The method used to generate predictions for results on training data. train_estimate_time : int, default=-1 The time taken to generate predictions for results on training data. fit_and_estimate_time : int, default=-1 - The time taken to fit the regressor to build and generate predictions for + The time taken to fit the estimator to build and generate predictions for results on training data. This is not necessarily always going to be fit_time + train_estimate_time, @@ -232,33 +229,21 @@ def write_regression_results( included in the train_estimate_time value. 
In this case fit_time + train_estimate_time would time fitting the model twice. """ - third_line = ( - f"{mse}," - f"{fit_time}," - f"{predict_time}," - f"{benchmark_time}," - f"{memory_usage}," + base_third_line = results_third_line( + y=y, + preds=preds, + fit_time=fit_time, + predict_time=predict_time, + benchmark_time=benchmark_time, + memory_usage=memory_usage, + ) + return ( + f"{base_third_line}," f"{train_estimate_method}," f"{train_estimate_time}," f"{fit_and_estimate_time}" ) - write_results_to_tsml_format( - predictions, - labels, - regressor_name, - dataset_name, - file_path, - full_path=full_path, - first_line_estimator_name=first_line_regressor_name, - split=split, - resample_id=resample_id, - time_unit=time_unit, - first_line_comment=first_line_comment, - second_line=parameter_info, - third_line=third_line, - ) - def write_clustering_results( cluster_predictions, @@ -380,99 +365,6 @@ def write_clustering_results( ) -def write_forecasting_results( - predictions, - labels, - forecaster_name, - dataset_name, - file_path, - full_path=True, - first_line_forecaster_name=None, - split=None, - random_seed=None, - time_unit="N/A", - first_line_comment=None, - parameter_info="No Parameter Info", - mape=-1, - fit_time=-1, - predict_time=-1, - benchmark_time=-1, - memory_usage=-1, -): - """Write the predictions for a forecasting experiment in the format used by tsml. - - Parameters - ---------- - predictions : np.array - The predicted values to write to file. Must be the same length as labels. - labels : np.array - The actual label values written to file with the predicted values. - forecaster_name : str - Name of the forecaster that made the predictions. Written to file and can - determine file structure if full_path is False. - dataset_name : str - Name of the problem the forecaster was built on. - file_path : str - Path to write the results file to or the directory to build the default file - structure if full_path is False. - full_path : boolean, default=True - If True, results are written directly to the directory passed in file_path. - If False, then a standard file structure using the forecaster and dataset names - is created and used to write the results file. - first_line_forecaster_name : str or None, default=None - Alternative name for the forecaster to be written to the file. If None, the - forecaster_name is used. Useful if full_path is False and extra information is - wanted in the forecaster name (i.e. and alias and class name) - split : str or None, default=None - Either None, 'TRAIN' or 'TEST'. Influences the result file name and first line - of the file. - random_seed : int or None, default=None - Indicates what random seed was used as a random_state for the forecaster. - time_unit : str, default="N/A" - The format used for timings in the file, i.e. 'Seconds', 'Milliseconds', - 'Nanoseconds' - first_line_comment : str or None, default=None - Optional comment appended to the end of the first line, i.e. the file used to - generate the results. - parameter_info : str, default="No Parameter Info" - Unstructured estimator dependent information, i.e. estimator parameters or - values from the model build. - mape: float, default=-1 - The mean absolute percentage error of the predictions. - fit_time : int, default=-1 - The time taken to fit the forecaster. - predict_time : int, default=-1 - The time taken to predict the forecasting labels. - benchmark_time : int, default=-1 - A benchmark time for the hardware used to scale other timings. 
- memory_usage : int, default=-1 - The memory usage of the forecaster. - """ - third_line = ( - f"{mape}," - f"{fit_time}," - f"{predict_time}," - f"{benchmark_time}," - f"{memory_usage}" - ) - - write_results_to_tsml_format( - predictions, - labels, - forecaster_name, - dataset_name, - file_path, - full_path=full_path, - first_line_estimator_name=first_line_forecaster_name, - split=split, - resample_id=random_seed, - time_unit=time_unit, - first_line_comment=first_line_comment, - second_line=parameter_info, - third_line=third_line, - ) - - def write_results_to_tsml_format( predictions, labels, diff --git a/tsml_eval/utils/tests/test_results_writing.py b/tsml_eval/utils/tests/test_results_writing.py index c3e6f2a2..ce8f4d48 100644 --- a/tsml_eval/utils/tests/test_results_writing.py +++ b/tsml_eval/utils/tests/test_results_writing.py @@ -23,10 +23,10 @@ _check_second_line, ) from tsml_eval.utils.results_writing import ( + regression_results_third_line, + results_third_line, write_classification_results, write_clustering_results, - write_forecasting_results, - write_regression_results, write_results_to_tsml_format, ) @@ -93,8 +93,8 @@ def test_write_classification_results_invalid(): def test_write_regression_results(): """Test writing of regression results files.""" labels, predictions, _ = _generate_labels_and_predictions() - - write_regression_results( + third_line = regression_results_third_line() + write_results_to_tsml_format( predictions, labels, "Test", @@ -102,6 +102,7 @@ def test_write_regression_results(): _REGRESSOR_RESULTS_PATH, full_path=False, first_line_comment="test_write_regression_results", + third_line=third_line, ) _check_regression_file_format( @@ -127,8 +128,8 @@ def _check_regression_file_format(file_path, num_results_lines=None): def test_write_forecasting_results(): """Test writing of forecasting results files.""" labels, predictions, _ = _generate_labels_and_predictions() - - write_forecasting_results( + third_line = results_third_line() + write_results_to_tsml_format( predictions, labels, "Test", @@ -136,6 +137,7 @@ def test_write_forecasting_results(): _FORECASTER_RESULTS_PATH, full_path=False, first_line_comment="test_write_forecasting_results", + third_line=third_line, ) _check_forecasting_file_format(