diff --git a/CODEBOOK.md b/CODEBOOK.md index f47093a..837a7ac 100644 --- a/CODEBOOK.md +++ b/CODEBOOK.md @@ -22,6 +22,7 @@ The purpose of this section is to label radar imagery for warm-season precipitat | Field | Units | Description | |--------------------------|-------|-------------------------------------| | *reflectivity* | dBZ | Intensity of returned radar signal | +| n_gates_50dBZ | percent | The percentage of gates greater than 50 dBZ | ### 2.2 Image Format @@ -42,8 +43,8 @@ Each image or region-of-interest must be assigned exactly one primary class. | Label | Description | |------------------------|-----------------------------------------------------------------------------| | No Precipitation | No significant return; background noise only. The image will only have blue and black colors.| -| Stratiform Precipitation | The image must have no pink colors. Green, yellow and red colors are present in a widespread blob. | -| Isolated Convection | The image must have regions of dark red and pink colors. These dark red and pink regions must be separated by regions of black and blue, with no connection to other dark red and pink regions through yellow regions. Over half of the image must be blue or black. | +| Stratiform Precipitation | The image must have no pink colors. Green, yellow and red colors are present in a widespread blob. The percentage of gates greater than 50 dBZ must not exceed 0.02 percent. If it does exceed 0.02 percent, then classify as a mesoscale convective system. | +| Isolated Convection | The image must have regions of dark red and pink colors. These dark red and pink regions must be separated by regions of black and blue, with no connection to other dark red and pink regions through yellow regions. Over half of the image must be blue or black. The percentage of gates with reflectivity greater than 30 dBZ must not exceed 10 percent. If it does exceed 30 percent, then classify as a mesoscale convective system. 
| | Mesoscale Convective System | A string or connected cluster of dark red and pink colors must be present in the image. This string can take on a curved structure. There can be more than one such string or cluster in the image. The dark red and pink colors in the clusters must be connected by yellow regions. | | Ambiguous / Uncertain | Cannot be classified with confidence. | diff --git a/lars/nepho/inference.py b/lars/nepho/inference.py index 23e456f..11282da 100644 --- a/lars/nepho/inference.py +++ b/lars/nepho/inference.py @@ -174,6 +174,14 @@ async def label_radar_data(radar_df, model, categories=None, guidelines=None, for category, description in categories.items(): prompt += f"{category}: {description}; " prompt += f"The reflectivity values range from {vmin} dBZ as indicated by the blue colors to {vmax} dBZ as indicated by the red colors." + for key in radar_df.columns: + if key.startswith("pct_gates_") and key.endswith("dbz"): + threshold = key[len("pct_gates_"):-len("dbz")] + prompt += f" The percentage of gates with relfectivity above {threshold} dBZ is provided as {key} in the data." + if key.startswith("n_gates_") and key.endswith("dbz"): + threshold = key[len("n_gates_"):-len("dbz")] + prompt += f" The number of gates with relfectivity above {threshold} dBZ is provided as {key} in the data." 
+ if guidelines: prompt += " When classifying, follow these annotator guidelines: " prompt += " ".join(guidelines) diff --git a/lars/preprocessing/__init__.py b/lars/preprocessing/__init__.py index 5f86c8d..54de16f 100644 --- a/lars/preprocessing/__init__.py +++ b/lars/preprocessing/__init__.py @@ -1,2 +1,2 @@ from .radar_preprocessing import preprocess_radar_data # noqa: F401 -from .labels import load_labels, save_labels, change_file_path # noqa: F401 \ No newline at end of file +from .labels import load_labels, save_labels, change_file_path, copy_labels # noqa: F401 \ No newline at end of file diff --git a/lars/preprocessing/labels.py b/lars/preprocessing/labels.py index 912461f..3700b0a 100644 --- a/lars/preprocessing/labels.py +++ b/lars/preprocessing/labels.py @@ -36,6 +36,50 @@ def load_labels(label_file): """ return pd.read_csv(label_file) +def copy_labels(source_df, target_df, match_on='time', label_column='label'): + """ + Copy labels from a source DataFrame to a target DataFrame. + + Matches rows either on the time index or on the file name (basename only, + so the directory portion of the path does not need to match). + + Parameters + ---------- + source_df (pd.DataFrame): DataFrame containing the labels to copy from. + target_df (pd.DataFrame): DataFrame to copy the labels into. + match_on (str): Either 'time' (match on the DataFrame index) or + 'file_path' (match on the basename of the 'file_path' column). + label_column (str): Name of the column containing labels. Default 'label'. + + Returns + ------- + pd.DataFrame + A copy of ``target_df`` with labels filled in from ``source_df`` where + a match was found. Rows with no match keep their existing label value. 
+ """ + if match_on not in ('time', 'file_path'): + raise ValueError("match_on must be either 'time' or 'file_path'") + + target_df = target_df.copy() + + if match_on == 'time': + lookup = source_df[label_column] + new_labels = target_df.index.map(lookup) + else: + source_keys = source_df['file_path'].apply(os.path.basename) + lookup = pd.Series(source_df[label_column].values, index=source_keys) + target_keys = target_df['file_path'].apply(os.path.basename) + new_labels = target_keys.map(lookup) + + new_labels = pd.Series(new_labels, index=target_df.index) + if label_column in target_df.columns: + target_df[label_column] = new_labels.where(new_labels.notna(), + target_df[label_column]) + else: + target_df[label_column] = new_labels + return target_df + + def save_labels(label_df, output_file): """ Save labels to a CSV file.