-
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmain.py
More file actions
669 lines (495 loc) · 29.9 KB
/
main.py
File metadata and controls
669 lines (495 loc) · 29.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'Kayuã Oleques Paim'
__email__ = 'kayuaolequesp@gmail.com'
# Bug fix: __version__ previously held the unfilled template string
# '{1}.{0}.{1}'; pinned to the version advertised in the class docstring.
__version__ = '1.0.1'
__initial_data__ = '2022/06/01'
__last_update__ = '2025/03/29'
__credits__ = ['Kayuã Oleques']
# MIT License
#
# Copyright (c) 2025 Synthetic Ocean AI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
try:
import sys
import time
import numpy
import pandas
import logging
from sklearn.utils import shuffle
from Engine.Metrics.Metrics import Metrics
from Engine.DataIO.CSVLoader import autosave
from Engine.DataIO.CSVLoader import autoload
from Engine.Arguments.Arguments import Arguments
from Engine.Arguments.Arguments import arguments
from Engine.Metrics.Metrics import import_metrics
from Engine.Evaluation.Evaluation import Evaluation
from sklearn.model_selection import StratifiedKFold
from Engine.DataIO.CSVLoader import CSVDataProcessor
from Engine.Classifiers.Classifiers import Classifiers
from Engine.Models.GenerativeModels import import_models
from Engine.Models.GenerativeModels import GenerativeModels
from Engine.Evaluation.CrossValidation import StratifiedData
from Engine.Classifiers.Classifiers import import_classifiers
from Engine.Support.HardwareManager import HardwareManager
except ImportError as error:
print(error)
print()
print("1. (optional) Setup a virtual environment: ")
print(" python3 -m venv ~/Python3venv/SyntheticOceanAI ")
print(" source ~/Python3venv/SyntheticOceanAI/bin/activate ")
print()
print("2. Install requirements:")
print(" pip3 install --upgrade pip")
print(" pip3 install -r requirements.txt ")
print()
sys.exit(-1)
# Default logging verbosity for the pipeline.
# NOTE(review): none of these three constants are referenced anywhere in this
# file — confirm they are consumed by external modules before removing.
DEFAULT_VERBOSITY = logging.INFO
# Timestamp layout used when formatting dates/times in logs or reports.
TIME_FORMAT = '%Y-%m-%d,%H:%M:%S'
# Default numeric dtype name for tabular data handling.
DEFAULT_DATA_TYPE = "float32"
class SynDataGen(Arguments, CSVDataProcessor, Metrics, GenerativeModels, Classifiers, Evaluation):
    """
    SYNTHETIC DATA GENERATION AND EVALUATION FRAMEWORK
    ==================================================

    End-to-end pipeline that trains a generative model on tabular data,
    synthesizes new samples, and evaluates them with stratified k-fold
    cross-validation. Combines data processing, model training, generation,
    metric tracking and result persistence through its parent mixins.

    Supported model types (``--model_type``):
        - ``adversarial``          Generative Adversarial Network (GAN).
        - ``autoencoder``          Plain autoencoder (encoder + decoder).
        - ``variational``          Variational Autoencoder (VAE).
        - ``quantized``            Vector-Quantized VAE (VQ-VAE).
        - ``wasserstein``          Wasserstein GAN.
        - ``wasserstein_gp``       Wasserstein GAN with gradient penalty.
        - ``latent_diffusion``     Latent diffusion (UNet + autoencoder).
        - ``denoising_diffusion``  Denoising diffusion.
        - ``copy``                 Naive copy/paste baseline.
        - ``random``               Random-noise baseline.
        - ``smote``                SMOTE oversampling baseline.
        - ``copula`` | ``ctgan`` | ``tvae``  Third-party SDV models.

    Evaluation strategies:
        - TS-TR: train classifiers on synthetic data, assess on held-out
          real data (measures generalization capability).
        - TR-TS: train classifiers on real data, assess on synthetic data
          (measures generation quality).

    Metrics tracked include Accuracy, Precision, Recall, F1, ROC-AUC, MSE,
    MAE, true/false negative rates, plus distance and likelihood measures
    (Euclidean, Hellinger, Manhattan, LogLikelihood).

    Example:
        >>> generator = SynDataGen()
        >>> generator.run_experiments()

    # Version: 1.0.1
    # Author: Synthetic Ocean AI - Team
    # License: MIT
    """

    # model_type -> attribute name of the fitted algorithm that supports
    # ``save_model(path, k_fold)``. Types absent from this map (copy, random,
    # smote and the SDV family) have nothing to persist.
    # NOTE(review): 'variational' maps to '_latent_variational_algorithm_diffusion'
    # here while synthesize_data() samples from '_variational_algorithm' —
    # confirm which attribute actually holds the trained variational model.
    _SAVABLE_ALGORITHMS = {
        'adversarial': '_adversarial_algorithm',
        'autoencoder': '_autoencoder_algorithm',
        'variational': '_latent_variational_algorithm_diffusion',
        'wasserstein': '_wasserstein_algorithm',
        'wasserstein_gp': '_wasserstein_gp_algorithm',
        'latent_diffusion': '_latent_diffusion_algorithm',
        'denoising_diffusion': '_denoising_diffusion_algorithm',
        'quantized': '_quantized_vae_algorithm',
    }

    # model_type -> (attribute of the fitted sampler, display name for logs).
    # 'copy' and the SDV family need extra arguments/state and are handled
    # explicitly in synthesize_data().
    _SAMPLERS = {
        'adversarial': ('_adversarial_algorithm', 'Adversarial'),
        'autoencoder': ('_autoencoder_algorithm', 'Autoencoder'),
        'variational': ('_variational_algorithm', 'Variational'),
        'wasserstein': ('_wasserstein_algorithm', 'Wasserstein'),
        'wasserstein_gp': ('_wasserstein_gp_algorithm', 'Wasserstein GP'),
        'latent_diffusion': ('_latent_diffusion_algorithm', 'LatentDiffusion'),
        'denoising_diffusion': ('_denoising_diffusion_algorithm', 'Denoising Diffusion'),
        'quantized': ('_quantized_vae_algorithm', 'Vector Quantized Variational Autoencoder'),
        'random': ('_random_noise_algorithm', 'Random Noise'),
        'smote': ('_smote_algorithm', 'SMOTE'),
    }

    # Third-party model types delegated to the SDV interface wrapper.
    _SDV_MODELS = ("copula", "ctgan", "tvae")

    @arguments
    def __init__(self):
        """
        Initialize the pipeline.

        Parent mixins handle argument parsing, data loading, metric
        bookkeeping, model registration and classifier setup; this
        constructor only prepares per-run state and configures hardware.
        """
        super().__init__()
        # Current cross-validation fold (0-based); set by run_experiments().
        self.fold_number = None
        # Last batch of generated samples (keyed per class by the samplers).
        self.data_generated = None
        # String identifier of the generator in use (mirrors model_type).
        self.generator_name = None
        # Output directory for generated data (provided by CSVDataProcessor).
        self.directory_output_data = self.get_data_generated_path()
        # Configure CPU/GPU placement before any model is built.
        self._manager = HardwareManager(use_gpu=self.arguments.use_gpu)
        self._manager.configure()
        # Lazily-created SDV wrapper; only used for copula/ctgan/tvae.
        self._sdv = None

    @import_metrics
    @import_classifiers
    @StratifiedData
    def run_experiments(self):
        """
        Run the full experiment over every stratified fold.

        For each fold: train the selected generative model, synthesize data
        matching the evaluation split, evaluate with both the TR-TS and
        TS-TR strategies, and checkpoint all results to ``Results.json``.
        Mean/std aggregates over all folds are appended after the last fold.

        The decorators import metrics and classifiers and populate
        ``self.list_folds`` with the stratified splits before the body runs.

        Raises:
            Exception: re-raised after logging if any fold fails.
        """
        logging.info("Starting experiment runs across %d folds.", len(self.list_folds))
        # Wall-clock timing for the whole experiment.
        total_start_time = time.time()
        results_file = self.get_evaluation_results_path() + "/Results.json"
        try:
            for fold, dictionary_data in enumerate(self.list_folds):
                fold_start_time = time.time()
                logging.info("")
                logging.info("Running experiment for fold %d.", fold + 1)
                logging.info("Fold %d training data shape: X=%s, Y=%s", fold + 1,
                             dictionary_data['x_training_real'].shape,
                             dictionary_data['y_training_real'].shape)
                self.fold_number = fold
                logging.debug("\t\tFold number updated to %d in the class.", self.fold_number)
                monitor_path = self.get_monitor_path()
                # Fit the generator on this fold's training split.
                self.train_model(dictionary_data['x_training_real'],
                                 dictionary_data['y_training_real'],
                                 monitor_path, fold)
                # Generate synthetic data mirroring the evaluation split's
                # class distribution; generation time is monitored per fold.
                self.monitoring_start_generating()
                evaluation_synthetic = self.synthesize_data(
                    dictionary_data['x_evaluation_real'],
                    dictionary_data['y_evaluation_real'],
                )
                self.monitoring_stop_generating(fold)
                logging.info("\t\tModel creation and prediction completed for fold %d.", fold + 1)
                logging.info("")
                logging.info("")
                logging.info(" starting evaluation for fold %d.", fold + 1)
                # TR-TS: train on real, assess on synthetic (generation quality).
                # TS-TR: train on synthetic, assess on real (generalization).
                self.evaluation_TR_TS(dictionary_data, evaluation_synthetic)
                self.evaluation_TS_TR(dictionary_data, evaluation_synthetic)
                fold_end_time = time.time()
                logging.info("Fold %d experiment completed in %.2f seconds.",
                             fold + 1, fold_end_time - fold_start_time)
                logging.info("------\n\n")
                # Checkpoint after every fold so partial results survive a crash.
                self.save_dictionary_to_json(results_file)
            # Append mean/std aggregates over all folds and persist once more.
            self.update_mean_std_fold()
            self.save_dictionary_to_json(results_file)
            total_end_time = time.time()
            logging.info("All experiments completed in %.2f seconds.",
                         total_end_time - total_start_time)
        except Exception as e:
            logging.error("An error occurred during experiment execution: %s", str(e))
            raise

    @import_models
    def train_model(self, x_real_samples, y_real_samples, monitor_path, k_fold):
        """
        Train the generator selected by ``self.arguments.model_type``.

        SDV model types (copula/ctgan/tvae) are delegated to the third-party
        SDV interface; every other type goes through the inherited
        ``training_model``. The fitted model is optionally persisted when
        ``--save_models`` is set.

        Args:
            x_real_samples (array): Real input features used for training.
            y_real_samples (array): Real target labels for those samples.
            monitor_path (str): Path used for training logs/checkpoints.
            k_fold (int): Current cross-validation fold index.

        Raises:
            Exception: re-raised after logging if training or saving fails.
        """
        logging.info("Starting model creation and prediction process.")
        logging.info("Number of real samples: %d", len(x_real_samples))
        try:
            logging.info("Training model with %d samples and model type: %s",
                         len(x_real_samples), self.arguments.model_type)
            self.monitoring_start_training()
            if self.arguments.model_type in self._SDV_MODELS:
                self.generator_name = self.arguments.model_type
                logging.info(f"Training SDV's model {self.generator_name} algorithm.")
                # Imported lazily so the SDV dependency is only required when
                # one of its model types is actually selected.
                from Engine.Algorithms.ThirdParty.SDVInterfaceAlgorithm import SDVInterfaceAlgorithm
                self._sdv = SDVInterfaceAlgorithm()
                self._sdv.training_model(x_real_samples, y_real_samples,
                                         self._data_original_header,
                                         self.arguments.model_type)
            else:
                self.training_model(self.arguments,
                                    self.get_number_columns(),
                                    x_real_samples,
                                    y_real_samples,
                                    monitor_path,
                                    k_fold)
            self.monitoring_stop_training(k_fold)
            logging.info("Model training completed.")
            if self.arguments.save_models:
                self._save_trained_model(k_fold)
        except Exception as e:
            logging.error("Error during model creation or data generation: %s", str(e))
            raise  # Re-raise for handling/termination by the caller.

    def _save_trained_model(self, k_fold):
        """Persist the fitted generator for *k_fold*, if the model supports it."""
        attribute = self._SAVABLE_ALGORITHMS.get(self.arguments.model_type)
        if attribute is None:
            # Bug fix: valid but non-persistable generators (copy, random,
            # smote and the SDV family) previously fell into an else branch
            # that terminated the process even though training succeeded.
            logging.warning("Model type '%s' does not support saving; skipping.",
                            self.arguments.model_type)
            return
        logging.info("Saving trained model.")
        getattr(self, attribute).save_model(self.get_models_saved_path(), k_fold)

    def synthesize_data(self, x_real_samples, y_real_samples):
        """
        Generate synthetic samples mirroring the class distribution of
        *y_real_samples*.

        The per-class sample counts are derived from the real labels and the
        generator selected by ``self.arguments.model_type`` is asked to
        produce that many samples per class. The result is cached on
        ``self.data_generated`` and optionally saved when ``--save_data``
        is set.

        Args:
            x_real_samples (array): Real features (only the 'copy' baseline
                consumes them directly).
            y_real_samples (array): Real labels used to size each class.

        Returns:
            The generated data, as produced by the selected sampler.
        """
        labels = y_real_samples.astype(int)
        unique_classes, counts = numpy.unique(labels, return_counts=True)
        class_counts = dict(zip(unique_classes, counts))
        number_samples_per_class = {'classes': class_counts,
                                    'number_classes': len(unique_classes)}
        # Bug fix: the original passed the dict as a lazy-% argument to a
        # format string containing no placeholder, producing a logging error.
        logging.info("\t\tnumber_samples_per_class: %s", number_samples_per_class)
        model_type = self.arguments.model_type
        if model_type == "copy":
            # The copy/paste baseline draws directly from the real samples.
            self.generator_name = 'copy'
            logging.info("Generating data using copy & paste algorithm.")
            self.data_generated = self._copy_algorithm.get_samples(number_samples_per_class,
                                                                   x_real_samples, y_real_samples)
        elif model_type in self._SDV_MODELS:
            # SDV models sample through the wrapper built in train_model().
            self.generator_name = model_type
            logging.info(f"Generating data using SDV's {self.generator_name} algorithm.")
            self.data_generated = self._sdv.get_samples(number_samples_per_class)
        elif model_type in self._SAMPLERS:
            attribute, display_name = self._SAMPLERS[model_type]
            self.generator_name = model_type
            logging.info("Generating data using %s algorithm.", display_name)
            self.data_generated = getattr(self, attribute).get_samples(number_samples_per_class)
        else:
            # Unknown model type: log and abort (sys.exit instead of the
            # builtin exit(), which is only guaranteed interactively).
            logging.error("Error during model selection")
            sys.exit(-1)
        logging.info("Data generation completed successfully for model type: %s", model_type)
        if self.arguments.save_data:
            self.save_data_generated()
        return self.data_generated

    @autosave
    def save_data_generated(self):
        """
        Persist the generated data.

        The actual writing is performed by the ``@autosave`` decorator; the
        body only logs progress around it.
        """
        logging.info("Entered the save_data_generated method.")
        logging.info("Attempting to save generated data.")
        logging.info("Generated data saved successfully.")
if __name__ == "__main__":
    # Script entry point: build the pipeline, echo the effective
    # configuration, then execute the cross-validated experiments.
    pipeline = SynDataGen()
    pipeline.show_all_settings()
    pipeline.run_experiments()