diff --git a/cesnet_tszoo/benchmarks.py b/cesnet_tszoo/benchmarks.py index b06ebb8..23b9175 100644 --- a/cesnet_tszoo/benchmarks.py +++ b/cesnet_tszoo/benchmarks.py @@ -28,37 +28,17 @@ class Benchmark: **Intended usage:** - For time-based: + 1. Call [`load_benchmark`][cesnet_tszoo.benchmarks.load_benchmark] with the desired benchmark. You can use your own saved benchmark or you can use an already built-in one. This will download the dataset and annotations (if available) if they have not been previously downloaded. + 2. Retrieve the initialized dataset using [`get_initialized_dataset`](reference_benchmarks.md#cesnet_tszoo.benchmarks.Benchmark.get_initialized_dataset). This will provide a dataset that is ready to use. Check beforehand what type of dataset is returned. + 3. Use [`get_train_dataloader`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_train_dataloader)/[`get_train_df`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_train_df)/[`get_train_numpy`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_train_numpy) to get training data for chosen model. + 4. [Optional] Modify used preprocessing steps with [`update_dataset_config_and_initialize`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.update_dataset_config_and_initialize). + 5. Validate the model and perform the hyperparameter optimization on [`get_val_dataloader`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_val_dataloader)/[`get_val_df`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_val_df)/[`get_val_numpy`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_val_numpy). + 6. 
Evaluate the model on [`get_test_dataloader`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_test_dataloader)/[`get_test_df`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_test_df)/[`get_test_numpy`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.get_test_numpy). - When using [`TimeBasedCesnetDataset`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset) (`dataset_type` = `DatasetType.TIME_BASED`): + You can create custom benchmarks with [`save_benchmark`](reference_cesnet_dataset.md#cesnet_tszoo.datasets.cesnet_dataset.CesnetDataset.save_benchmark). + They will be saved to `"data_root"/tszoo/benchmarks/` directory, where `data_root` was passed as parameter to [`load_benchmark`][cesnet_tszoo.benchmarks.load_benchmark]. - 1. Create an instance of the dataset with the desired data root by calling [`get_dataset`](reference_cesnet_database.md#cesnet_tszoo.datasets.databases.cesnet_database.CesnetDatabase.get_dataset). This will download the dataset if it has not been previously downloaded and return instance of dataset. - 2. Create an instance of [`TimeBasedConfig`](reference_time_based_config.md#references.TimeBasedConfig) and set it using [`set_dataset_config_and_initialize`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.set_dataset_config_and_initialize). - This initializes the dataset, including data splitting (train/validation/test), fitting transformers (if needed), selecting features, and more. This is cached for later use. - 3. 
Use [`get_train_dataloader`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset)/[`get_train_df`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_train_df)/[`get_train_numpy`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_train_numpy) to get training data for chosen model. - 4. Validate the model and perform the hyperparameter optimalization on [`get_val_dataloader`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_val_dataloader)/[`get_val_df`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_val_df)/[`get_val_numpy`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_val_numpy). - 5. Evaluate the model on [`get_test_dataloader`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_test_dataloader)/[`get_test_df`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_test_df)/[`get_test_numpy`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.get_test_numpy). - - When using [`SeriesBasedCesnetDataset`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset) (`dataset_type` = `DatasetType.SERIES_BASED`): - - 1. Create an instance of the dataset with the desired data root by calling [`get_dataset`](reference_cesnet_database.md#cesnet_tszoo.datasets.databases.cesnet_database.CesnetDatabase.get_dataset). This will download the dataset if it has not been previously downloaded and return instance of dataset. - 2. 
Create an instance of [`SeriesBasedConfig`](reference_series_based_config.md#references.SeriesBasedConfig) and set it using [`set_dataset_config_and_initialize`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.set_dataset_config_and_initialize). - This initializes the dataset, including data splitting (train/validation/test), fitting transformers (if needed), selecting features, and more. This is cached for later use. - 3. Use [`get_train_dataloader`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_train_dataloader)/[`get_train_df`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_train_df)/[`get_train_numpy`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_train_numpy) to get training data for chosen model. - 4. Validate the model and perform the hyperparameter optimalization on [`get_val_dataloader`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_val_dataloader)/[`get_val_df`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_val_df)/[`get_val_numpy`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_val_numpy). - 5. 
Evaluate the model on [`get_test_dataloader`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_test_dataloader)/[`get_test_df`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_test_df)/[`get_test_numpy`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.get_test_numpy). - - When using [`DisjointTimeBasedCesnetDataset`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset) (`dataset_type` = `DatasetType.DISJOINT_TIME_BASED`): - - 1. Create an instance of the dataset with the desired data root by calling [`get_dataset`](reference_cesnet_database.md#cesnet_tszoo.datasets.databases.cesnet_database.CesnetDatabase.get_dataset). This will download the dataset if it has not been previously downloaded and return instance of dataset. - 2. Create an instance of [`DisjointTimeBasedConfig`](reference_disjoint_time_based_config.md#references.DisjointTimeBasedConfig) and set it using [`set_dataset_config_and_initialize`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.set_dataset_config_and_initialize). - This initializes the dataset, including data splitting (train/validation/test), fitting transformers (if needed), selecting features, and more. This is cached for later use. - 3. 
Use [`get_train_dataloader`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_train_dataloader)/[`get_train_df`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_train_df)/[`get_train_numpy`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_train_numpy) to get training data for chosen model. - 4. Validate the model and perform the hyperparameter optimalization on [`get_val_dataloader`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_val_dataloader)/[`get_val_df`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_val_df)/[`get_val_numpy`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_val_numpy). - 5. Evaluate the model on [`get_test_dataloader`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_test_dataloader)/[`get_test_df`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_test_df)/[`get_test_numpy`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.get_test_numpy). 
- - You can create custom time-based benchmarks with [`save_benchmark`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.save_benchmark), series-based benchmarks with [`save_benchmark`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.save_benchmark) or disjoint-time-based with [`save_benchmark`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.save_benchmark). - They will be saved to `"data_root"/tszoo/benchmarks/` directory, where `data_root` was set when you created instance of dataset. + The above steps are practically the same for all dataset types, but there can be small differences in method parameters. Check each of them for info about that. """ def __init__(self, config: DatasetConfig, dataset: CesnetDataset, description: str = None): diff --git a/cesnet_tszoo/datasets/disjoint_time_based_cesnet_dataset.py b/cesnet_tszoo/datasets/disjoint_time_based_cesnet_dataset.py index 0568b31..1056318 100644 --- a/cesnet_tszoo/datasets/disjoint_time_based_cesnet_dataset.py +++ b/cesnet_tszoo/datasets/disjoint_time_based_cesnet_dataset.py @@ -37,7 +37,7 @@ class DisjointTimeBasedCesnetDataset(CesnetDataset): - **Numpy array**: For loading the entire training, validation or test set at once. - See [loading data][loading-data] for more details. - The dataset is stored in a [PyTables](https://www.pytables.org/) database. 
The internal `TimeBasedDataset`, `SplittedDataset`, `TimeBasedInitializerDataset` classes (used only when calling [`set_dataset_config_and_initialize`](reference_disjoint_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.disjoint_time_based_cesnet_dataset.DisjointTimeBasedCesnetDataset.set_dataset_config_and_initialize)) act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) + The dataset is stored in a [PyTables](https://www.pytables.org/) dataset. The internal `DisjointTimeBasedSplittedDataset`, `DisjointTimeBasedSplitDataset` and `DisjointTimeBasedInitializerDataset` classes act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) interface. These wrappers are compatible with PyTorch’s [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), providing efficient parallel data loading. The dataset configuration is done through the [`DisjointTimeBasedConfig`](reference_disjoint_time_based_config.md#references.DisjointTimeBasedConfig) class. 
@@ -64,11 +64,11 @@ class DisjointTimeBasedCesnetDataset(CesnetDataset): """Configuration of the dataset.""" train_dataset: Optional[DisjointTimeBasedSplittedDataset] = field(default=None, init=False) - """Training set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database.""" + """Training set as a `DisjointTimeBasedSplittedDataset` instance wrapping multiple `DisjointTimeBasedSplitDataset` that wrap the PyTables dataset.""" val_dataset: Optional[DisjointTimeBasedSplittedDataset] = field(default=None, init=False) - """Validation set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database.""" + """Validation set as a `DisjointTimeBasedSplittedDataset` instance wrapping multiple `DisjointTimeBasedSplitDataset` that wrap the PyTables dataset.""" test_dataset: Optional[DisjointTimeBasedSplittedDataset] = field(default=None, init=False) - """Test set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database. """ + """Test set as a `DisjointTimeBasedSplittedDataset` instance wrapping multiple `DisjointTimeBasedSplitDataset` that wrap the PyTables dataset. """ train_dataloader: Optional[DisjointTimeBasedDataloader] = field(default=None, init=False) """Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for training set.""" diff --git a/cesnet_tszoo/datasets/series_based_cesnet_dataset.py b/cesnet_tszoo/datasets/series_based_cesnet_dataset.py index 30cb079..4d14335 100644 --- a/cesnet_tszoo/datasets/series_based_cesnet_dataset.py +++ b/cesnet_tszoo/datasets/series_based_cesnet_dataset.py @@ -39,7 +39,7 @@ class SeriesBasedCesnetDataset(CesnetDataset): - **Numpy array**: For loading the entire training, validation, test or all set at once. - See [loading data][loading-data] for more details. - The dataset is stored in a [PyTables](https://www.pytables.org/) database. 
The internal `SeriesBasedDataset` and `SeriesBasedInitializerDataset` classes (used only when calling [`set_dataset_config_and_initialize`](reference_series_based_cesnet_dataset.md#cesnet_tszoo.datasets.series_based_cesnet_dataset.SeriesBasedCesnetDataset.set_dataset_config_and_initialize)) act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) + The dataset is stored in a [PyTables](https://www.pytables.org/) dataset. The internal `SeriesBasedDataset` and `SeriesBasedInitializerDataset` classes act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) interface. These wrappers are compatible with PyTorch’s [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), providing efficient parallel data loading. The dataset configuration is done through the [`SeriesBasedConfig`](reference_series_based_config.md#references.SeriesBasedConfig) class. 
@@ -66,13 +66,13 @@ class SeriesBasedCesnetDataset(CesnetDataset): """Configuration of the dataset.""" train_dataset: Optional[SeriesBasedDataset] = field(default=None, init=False) - """Training set as a `SeriesBasedDataset` instance wrapping the PyTables database.""" + """Training set as a `SeriesBasedDataset` instance wrapping the PyTables dataset.""" val_dataset: Optional[SeriesBasedDataset] = field(default=None, init=False) - """Validation set as a `SeriesBasedDataset` instance wrapping the PyTables database.""" + """Validation set as a `SeriesBasedDataset` instance wrapping the PyTables dataset.""" test_dataset: Optional[SeriesBasedDataset] = field(default=None, init=False) - """Test set as a `SeriesBasedDataset` instance wrapping the PyTables database.""" + """Test set as a `SeriesBasedDataset` instance wrapping the PyTables dataset.""" all_dataset: Optional[SeriesBasedDataset] = field(default=None, init=False) - """All set as a `SeriesBasedDataset` instance wrapping the PyTables database. """ + """All set as a `SeriesBasedDataset` instance wrapping the PyTables dataset. """ train_dataloader: Optional[SeriesBasedDataloader] = field(default=None, init=False) """Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for training set.""" diff --git a/cesnet_tszoo/datasets/time_based_cesnet_dataset.py b/cesnet_tszoo/datasets/time_based_cesnet_dataset.py index acca79d..98517dd 100644 --- a/cesnet_tszoo/datasets/time_based_cesnet_dataset.py +++ b/cesnet_tszoo/datasets/time_based_cesnet_dataset.py @@ -38,7 +38,7 @@ class TimeBasedCesnetDataset(CesnetDataset): - **Numpy array**: For loading the entire training, validation, test or all set at once. - See [loading data][loading-data] for more details. - The dataset is stored in a [PyTables](https://www.pytables.org/) database. 
The internal `TimeBasedDataset`, `SplittedDataset`, `TimeBasedInitializerDataset` classes (used only when calling [`set_dataset_config_and_initialize`](reference_time_based_cesnet_dataset.md#cesnet_tszoo.datasets.time_based_cesnet_dataset.TimeBasedCesnetDataset.set_dataset_config_and_initialize)) act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) + The dataset is stored in a [PyTables](https://www.pytables.org/) dataset. The internal `TimeBasedSplittedDataset`, `TimeSplitBasedDataset` and `TimeBasedInitializerDataset` classes act as wrappers that implement the PyTorch [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) interface. These wrappers are compatible with PyTorch’s [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), providing efficient parallel data loading. The dataset configuration is done through the [`TimeBasedConfig`](reference_time_based_config.md#references.TimeBasedConfig) class. 
@@ -65,16 +65,16 @@ class TimeBasedCesnetDataset(CesnetDataset): """Configuration of the dataset.""" train_dataset: Optional[TimeBasedSplittedDataset] = field(default=None, init=False) - """Training set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database.""" + """Training set as a `TimeBasedSplittedDataset` instance wrapping multiple `TimeSplitBasedDataset` that wrap the PyTables dataset.""" val_dataset: Optional[TimeBasedSplittedDataset] = field(default=None, init=False) - """Validation set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database.""" + """Validation set as a `TimeBasedSplittedDataset` instance wrapping multiple `TimeSplitBasedDataset` that wrap the PyTables dataset.""" test_dataset: Optional[TimeBasedSplittedDataset] = field(default=None, init=False) - """Test set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database. """ + """Test set as a `TimeBasedSplittedDataset` instance wrapping multiple `TimeSplitBasedDataset` that wrap the PyTables dataset. """ all_dataset: Optional[TimeBasedSplittedDataset] = field(default=None, init=False) - """All set as a `SplittedDataset` instance wrapping multiple `TimeBasedDataset` that wrap the PyTables database. """ + """All set as a `TimeBasedSplittedDataset` instance wrapping multiple `TimeSplitBasedDataset` that wrap the PyTables dataset. 
""" train_dataloader: Optional[TimeBasedDataloader] = field(default=None, init=False) """Iterable PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) for training set.""" diff --git a/cesnet_tszoo/pytables_data/disjoint_based_splitted_dataset.py b/cesnet_tszoo/pytables_data/disjoint_based_splitted_dataset.py index 3640109..168ade6 100644 --- a/cesnet_tszoo/pytables_data/disjoint_based_splitted_dataset.py +++ b/cesnet_tszoo/pytables_data/disjoint_based_splitted_dataset.py @@ -2,7 +2,7 @@ from cesnet_tszoo.pytables_data.disjoint_based_split_dataset import DisjointTimeBasedSplitDataset from cesnet_tszoo.pytables_data.base_datasets.splitted_dataset import SplittedDataset -from cesnet_tszoo.data_models.load_dataset_configs.time_load_config import TimeLoadConfig +from cesnet_tszoo.data_models.load_dataset_configs.disjoint_time_load_config import DisjointTimeLoadConfig class DisjointTimeBasedSplittedDataset(SplittedDataset): @@ -12,7 +12,7 @@ class DisjointTimeBasedSplittedDataset(SplittedDataset): Splits ts_row_ranges based on workers and for each worker creates a DisjointTimeBasedSplitDataset with subset of values from ts_row_ranges. Then each worker gets a dataloader. 
""" - def __init__(self, database_path: str, table_data_path: str, load_config: TimeLoadConfig, workers: int): + def __init__(self, database_path: str, table_data_path: str, load_config: DisjointTimeLoadConfig, workers: int): self.load_config = load_config super().__init__(database_path, table_data_path, load_config, workers) diff --git a/docs/using_datasets.md b/docs/using_datasets.md index bda9ad9..9c85c06 100644 --- a/docs/using_datasets.md +++ b/docs/using_datasets.md @@ -66,7 +66,7 @@ from cesnet_tszoo.datasets import CESNET_AGG23 # Using dataset from CESNET_AGG23 # Only time-based -time_based_dataset = CESNET_TimeSeries24.get_dataset(data_root="/some_directory/") +time_based_dataset = CESNET_AGG23.get_dataset(data_root="/some_directory/") config = TimeBasedConfig(ts_ids=1) time_based_dataset.set_dataset_config_and_initialize(config)