|
| 1 | +import matplotlib |
| 2 | +import matplotlib.pyplot as plt |
| 3 | +import numpy as np |
| 4 | +import paddle |
| 5 | +import paddle.nn as nn |
| 6 | +import paddle.nn.functional as F |
| 7 | +import pandas as pd |
| 8 | +from sklearn.compose import ColumnTransformer |
| 9 | +from sklearn.metrics import ConfusionMatrixDisplay |
| 10 | +from sklearn.metrics import accuracy_score |
| 11 | +from sklearn.metrics import classification_report |
| 12 | +from sklearn.metrics import confusion_matrix |
| 13 | +from sklearn.model_selection import StratifiedShuffleSplit |
| 14 | +from sklearn.preprocessing import OneHotEncoder |
| 15 | +from sklearn.preprocessing import StandardScaler |
| 16 | + |
# Use the SimHei font so the Chinese column labels render correctly in plots,
# and keep the minus sign displayable under a CJK font.
matplotlib.rcParams["font.family"] = "SimHei"
matplotlib.rcParams["axes.unicode_minus"] = False

# ==== Parameter Configuration (Customizable) ====
EPOCHS = 500  # number of full-batch training iterations
LR = 0.01  # Adam learning rate
CLASS_WEIGHTS = [3.5, 3.5, 2.0, 2.5]  # Class order: Low, Mid-Low, Mid-High, High
| 24 | + |
| 25 | +# ==== Classification Label Function ==== |
def classify_emission(value):
    """Map a raw CO2 emission value (kg) to an ordinal class label.

    Returns 0 (Low), 1 (Mid-Low), 2 (Mid-High) or 3 (High) using fixed
    upper bounds of 1500, 7800 and 40000 kg; anything at or above the
    last bound is class 3.
    """
    upper_bounds = (1500, 7800, 40000)
    for label, bound in enumerate(upper_bounds):
        if value < bound:
            return label
    return 3
| 35 | + |
| 36 | + |
| 37 | +# ==== FocalLoss Definition ==== |
class FocalLoss(nn.Layer):
    """Focal loss (Lin et al., 2017) for multi-class classification.

    Down-weights easy examples via the factor (1 - pt) ** gamma so training
    concentrates on hard, misclassified samples; an optional per-class
    `weight` tensor additionally counteracts class imbalance.
    """

    def __init__(self, gamma=2, weight=None):
        super().__init__()
        self.gamma = gamma  # focusing parameter; gamma=0 reduces to (weighted) CE
        self.weight = weight  # optional per-class weights, shape [num_classes]

    def forward(self, input, target):
        # pt must come from the UNWEIGHTED cross-entropy: exp(-weighted_CE)
        # is no longer the true class probability when weights != 1, which
        # distorts the focusing term. Compute pt unweighted, then apply the
        # class weight to the loss term only.
        logpt = F.cross_entropy(input, target, reduction="none")
        pt = paddle.exp(-logpt)
        focal_factor = (1 - pt) ** self.gamma
        if self.weight is not None:
            ce = F.cross_entropy(input, target, weight=self.weight, reduction="none")
        else:
            ce = logpt
        loss = focal_factor * ce
        return loss.mean()
| 49 | + |
| 50 | + |
# ==== Load and Clean Data ====
# Read the fused satellite/enterprise dataset and drop rows missing any
# field required downstream (target, timestamp, province, satellite
# coordinates, XCO2 concentration, wind direction/speed).
df = pd.read_excel("./Fusion_Data.xlsx")
df = df.dropna(
    subset=[
        "企业CO₂排放量 (kg)",  # enterprise CO2 emissions (kg) — prediction target
        "匹配时间",  # match timestamp
        "企业省份",  # enterprise province
        "卫星中心纬度",  # satellite center latitude
        "卫星中心经度",  # satellite center longitude
        "卫星CO₂浓度 (xco2)",  # satellite CO2 concentration (xco2)
        "风向",  # wind direction
        "风速",  # wind speed
    ]
)
# Derive hour-of-day and month features from the match timestamp.
df["匹配时间"] = pd.to_datetime(df["匹配时间"])
df["hour"] = df["匹配时间"].dt.hour
df["month"] = df["匹配时间"].dt.month
| 68 | + |
# ==== Feature Processing ====
# Numeric features are standardized; the province column is one-hot encoded.
numeric_features = ["卫星中心纬度", "卫星中心经度", "卫星CO₂浓度 (xco2)", "风向", "风速", "hour", "month"]
categorical_features = ["企业省份"]
X_raw = df[numeric_features + categorical_features]
y_raw = df["企业CO₂排放量 (kg)"].values.reshape(-1, 1)
# Discretize the continuous emission target into the 4 ordinal classes.
labels = np.vectorize(classify_emission)(y_raw.flatten())
enterprise_names = df["企业名称"].values

ct = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)
X = ct.fit_transform(X_raw)
# OneHotEncoder emits a scipy sparse matrix by default, and paddle.to_tensor
# below cannot consume sparse input — densify when necessary.
if hasattr(X, "toarray"):
    X = X.toarray()
| 84 | + |
# ==== Stratified Sampling ====
# One stratified 80/20 split preserves the imbalanced class proportions
# in both partitions.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X, labels))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
name_train, name_test = enterprise_names[train_idx], enterprise_names[test_idx]

# Convert to paddle tensors; class labels must be int64 for cross_entropy.
X_train = paddle.to_tensor(X_train, dtype="float32")
y_train = paddle.to_tensor(y_train, dtype="int64")
X_test = paddle.to_tensor(X_test, dtype="float32")
y_test = paddle.to_tensor(y_test, dtype="int64")
| 96 | + |
# ==== Network Architecture ====
class EmissionClassifier(nn.Layer):
    """MLP classifier: shared feature trunk plus a 4-way output head.

    Emits raw logits (no softmax) — intended to be paired with a
    cross-entropy-style loss. Trunk: input -> 256 -> 128 -> 64 with ReLU
    and dropout; head: 64 -> 32 -> 4.
    """

    def __init__(self, input_dim):
        super().__init__()
        trunk = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        self.shared = nn.Sequential(*trunk)
        head = [nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 4)]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        features = self.shared(x)
        logits = self.classifier(features)
        return logits
| 116 | + |
| 117 | + |
# ==== Model Training ====
model = EmissionClassifier(input_dim=X.shape[1])
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=LR)
loss_fn = FocalLoss(gamma=2, weight=paddle.to_tensor(CLASS_WEIGHTS, dtype="float32"))

train_loss_record = []  # per-epoch training loss, as plain floats for plotting
val_acc_record = []  # held-out accuracy sampled every 20 epochs

# Full-batch training: the entire training set goes through one forward pass.
for epoch in range(EPOCHS):
    model.train()
    logits = model(X_train)
    loss = loss_fn(logits, y_train)
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    # float(loss) is robust across paddle versions; loss.numpy() may return
    # a shape-[1] ndarray, which breaks the ":.4f" format spec below.
    train_loss_record.append(float(loss))

    if (epoch + 1) % 20 == 0:
        model.eval()
        with paddle.no_grad():
            val_logits = model(X_test)
            preds = paddle.argmax(val_logits, axis=1)
            acc = accuracy_score(y_test.numpy(), preds.numpy())
            val_acc_record.append(acc)
        print(f"[Epoch {epoch+1}] loss={float(loss):.4f}, acc={acc:.4f}")
| 143 | + |
# ==== Model Evaluation ====
# NOTE(review): these metrics are computed on the FULL dataset (train +
# test combined), so they overstate generalization; the honest held-out
# figure is the per-epoch `acc` printed during training.
model.eval()
X_all_tensor = paddle.to_tensor(X, dtype="float32")
with paddle.no_grad():
    preds = paddle.argmax(model(X_all_tensor), axis=1).numpy()

print("\n🎯 Overall Accuracy: {:.2f}%".format(accuracy_score(labels, preds) * 100))
print("\n📊 Classification Report:")
# Dict form is kept for the per-class recall bar chart; the second call
# prints the human-readable table.
report = classification_report(
    labels, preds, target_names=["Low", "Mid-Low", "Mid-High", "High"], output_dict=True
)
print(
    classification_report(
        labels, preds, target_names=["Low", "Mid-Low", "Mid-High", "High"]
    )
)
| 160 | + |
# ==== 📈 Training Loss Curve ====
# Show how the focal loss evolved over the training epochs.
plt.figure()
plt.plot(train_loss_record, label="Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
| 171 | + |
# ==== 📊 Recall per Class Bar Chart ====
# Pull the per-class recall values out of the classification-report dict.
plt.figure()
target_names = ["Low", "Mid-Low", "Mid-High", "High"]
recalls = [report[cls]["recall"] for cls in target_names]
plt.bar(target_names, recalls)
plt.ylabel("Recall")
plt.ylim(0, 1)
plt.title("Recall per Class")
plt.grid(axis="y")
plt.tight_layout()
plt.show()
| 183 | + |
# ==== Confusion Matrix ====
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(labels, preds)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=["Low", "Mid-Low", "Mid-High", "High"]
)
disp.plot(cmap="Blues")
plt.title("Predicted vs Actual Class")
plt.tight_layout()
plt.show()
| 192 | + |
# ==== Export Results ====
# One row per sample: enterprise name plus actual/predicted class ids.
results = pd.DataFrame(
    {
        "Enterprise Name": enterprise_names,
        "Actual Class": labels,
        "Predicted Class": preds,
    }
)
results.to_csv("carbon_emission_prediction_results.csv", index=False)
print("✅ Results exported to: carbon_emission_prediction_results.csv")
0 commit comments