|
| 1 | +# AuctionDataset |
| 2 | + |
| 3 | +**`src.data_processing.datasets.AuctionDataset(dataframe, target_column='conversion_flag')`** |
| 4 | + |
| 5 | +A PyTorch `Dataset` designed to handle cleaned auction data. This class takes a preprocessed pandas DataFrame, separates the features from the target variable, and provides an interface for iterating over individual samples (feature dictionary and target tensor). |
| 6 | + |
| 7 | +## Parameters |
| 8 | + |
| 9 | +* **`dataframe`** (`pd.DataFrame`): The input DataFrame containing the cleaned and preprocessed auction data. Must be a pandas DataFrame. |
| 10 | +* **`target_column`** (`str`, *optional*, default=`'conversion_flag'`): The name of the column in the `dataframe` that represents the target variable. This column must exist in the provided DataFrame. |
| 11 | + |
| 12 | +## Attributes |
| 13 | + |
| 14 | +* **`target`** (`torch.Tensor`): A tensor containing the target variable values extracted from the `target_column`. The data type is set to `torch.float32`, suitable for many binary classification loss functions. |
| 15 | +* **`features`** (`pd.DataFrame`): A DataFrame containing only the feature columns derived from the input `dataframe`. The `target_column` itself, along with the non-feature columns `unique_id`, `impression_dttm_utc`, `conv_dttm_utc`, and `dte`, is automatically dropped. You might need to adjust the drop logic within the class if your preprocessing yields different non-feature columns. |
| 16 | +* **`feature_names`** (`list[str]`): A list containing the names of the columns considered as features. |
| 17 | + |
| 18 | +## Methods |
| 19 | + |
| 20 | +### `__len__(self)` |
| 21 | + |
| 22 | +Returns the total number of samples (rows) in the dataset. |
| 23 | + |
| 24 | +* **Returns**: `int` - The number of samples. |
| 25 | + |
| 26 | +### `__getitem__(self, idx)` |
| 27 | + |
| 28 | +Retrieves the features and target for a specific sample index. |
| 29 | + |
| 30 | +* **Parameters**: |
| 31 | + * **`idx`** (`int` or `torch.Tensor`): The index of the sample to retrieve. |
| 32 | +* **Returns**: `tuple` - A tuple containing: |
| 33 | + 1. `dict`: A dictionary where keys are feature names (`str`) and values are the corresponding feature values for the sample at the given index. |
| 34 | + 2. `torch.Tensor`: A tensor containing the target value for the sample. |
| 35 | + |
| 36 | +## Raises |
| 37 | + |
| 38 | +* **`TypeError`**: If the input `dataframe` is not a pandas DataFrame. |
| 39 | +* **`ValueError`**: If the specified `target_column` does not exist in the input `dataframe`. |
| 40 | + |
| 41 | +## Example Usage |
| 42 | + |
| 43 | +```python |
| 44 | +import pandas as pd |
| 45 | +from torch.utils.data import DataLoader |
| 46 | +from src.data_processing.datasets import AuctionDataset # Assuming the class is in this path |
| 47 | + |
| 48 | +# Assume 'cleaned_data.csv' contains your preprocessed data |
| 49 | +try: |
| 50 | + df = pd.read_csv('cleaned_data.csv') |
| 51 | +except FileNotFoundError: |
| 52 | + print("Error: cleaned_data.csv not found. Please provide a valid path.") |
| 53 | + # Create a dummy DataFrame for demonstration if file not found |
| 54 | + data = { |
| 55 | + 'feature1': [1, 2, 3, 4, 5], |
| 56 | + 'feature2': [0.1, 0.2, 0.3, 0.4, 0.5], |
| 57 | + 'device_type': ['mobile', 'desktop', 'mobile', 'tablet', 'desktop'], |
| 58 | + 'conversion_flag': [0, 1, 0, 1, 0], |
| 59 | + 'unique_id': [101, 102, 103, 104, 105], |
| 60 | + 'impression_dttm_utc': pd.to_datetime(['2023-01-01 10:00:00', '2023-01-01 10:05:00', '2023-01-01 10:10:00', '2023-01-01 10:15:00', '2023-01-01 10:20:00']), |
| 61 | + 'conv_dttm_utc': pd.to_datetime([pd.NaT, '2023-01-01 10:06:00', pd.NaT, '2023-01-01 10:17:00', pd.NaT]), |
| 62 | + 'dte': ['2023-01-01'] * 5 |
| 63 | + } |
| 64 | + df = pd.DataFrame(data) |
| 65 | + |
| 66 | + |
| 67 | +# Create the dataset instance |
| 68 | +# Note: Further preprocessing like one-hot encoding 'device_type' might be needed |
| 69 | +# depending on the model. This dataset returns features as they are in the DataFrame. |
| 70 | +# One-hot encode categorical features before passing to the dataset if needed by the model |
| 71 | +df = pd.get_dummies(df, columns=['device_type'], drop_first=True) # Example encoding |
| 72 | + |
| 73 | +auction_dataset = AuctionDataset(dataframe=df, target_column='conversion_flag') |
| 74 | + |
| 75 | +# Get the number of samples |
| 76 | +print(f"Number of samples: {len(auction_dataset)}") |
| 77 | + |
| 78 | +# Get a single sample |
| 79 | +features, target = auction_dataset[0] |
| 80 | +print("\nSample 0:") |
| 81 | +print("Features:", features) |
| 82 | +print("Target:", target) |
| 83 | + |
| 84 | +# Use with DataLoader |
| 85 | +data_loader = DataLoader(auction_dataset, batch_size=2, shuffle=True) |
| 86 | + |
| 87 | +print("\nIterating through DataLoader:") |
| 88 | +for batch_idx, (batch_features, batch_targets) in enumerate(data_loader): |
| 89 | + print(f"\nBatch {batch_idx}:") |
| 90 | + # Note: batch_features will be a dictionary where each value is a list/tensor of features for the batch |
| 91 | + # You might need to further process this structure depending on your model's input requirements |
| 92 | + print(" Batch Features Structure (Keys):", batch_features.keys()) |
| 93 | + print(" Feature 'feature1' batch:", batch_features['feature1']) |
| 94 | + print(" Batch Targets:", batch_targets) |
| 95 | + if batch_idx >= 1: # Show first 2 batches |
| 96 | + break |
| 97 | +``` |
| 98 | + |