# Source: Dowan/homework3_code.py at main · DOWANESS/Dowan · GitHub
import pandas as pd
import numpy as np
import os

os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Plot labels in this script are Chinese. SimHei (common on Windows) stays the
# first choice; the remaining entries are CJK-capable fonts that are presumably
# available on other platforms, so matplotlib can fall back to whichever one is
# installed instead of rendering empty boxes.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Arial Unicode MS',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'DejaVu Sans',
]
plt.rcParams['axes.unicode_minus'] = False  # work around minus signs not displaying with the font above
# TF_ENABLE_ONEDNN_OPTS is already set to '0' before the TensorFlow import at
# the top of the file; a second identical assignment here was redundant.

# Data loading and preprocessing
def load_and_preprocess(file_path):
    """Load the pollution CSV and return scaled, fully numeric features.

    Args:
        file_path: Path to a UTF-8 encoded CSV file. The file must provide a
            ``date`` column (parsed as datetimes and used as the index), the
            numeric columns ``pollution``, ``dew``, ``temp``, ``press``,
            ``wnd_spd``, ``snow`` and ``rain``, and the categorical column
            ``wnd_dir``.

    Returns:
        A ``(features, scaler)`` tuple. ``features`` is a DataFrame indexed
        like the input whose columns are the min-max scaled numeric columns
        (``pollution`` first) followed by the one-hot encoded wind direction
        columns. ``scaler`` is the fitted ``MinMaxScaler`` for the numeric
        columns, needed later to map values back to original units.
    """
    # Load the raw data
    df = pd.read_csv(
        file_path,
        parse_dates=['date'],
        index_col='date',
        encoding='utf-8'
    )
    print("原始数据样例：\n", df.head())

    # Separate the feature types
    numeric_cols = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
    categorical_cols = ['wnd_dir']

    # Fill missing pollution values with the column median. Assign the result
    # back instead of calling fillna(inplace=True) on the column selection:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df['pollution'] = df['pollution'].fillna(df['pollution'].median())

    # One-hot encode the wind direction; drop='first' omits one category
    # column so the encoded columns are not redundant.
    wind_encoder = OneHotEncoder(drop='first', sparse_output=False)
    wind_encoded = wind_encoder.fit_transform(df[categorical_cols])
    wind_cols = wind_encoder.get_feature_names_out(categorical_cols)

    # Min-max scale the numeric features.
    # NOTE(review): the scaler is fitted on every row of the file, i.e. before
    # any train/test split a caller performs afterwards.
    scaler = MinMaxScaler()
    numeric_scaled = scaler.fit_transform(df[numeric_cols])

    # Merge the processed features: numeric columns first, then wind columns
    processed_data = np.hstack([numeric_scaled, wind_encoded])
    feature_cols = numeric_cols + wind_cols.tolist()

    return pd.DataFrame(processed_data, columns=feature_cols, index=df.index), scaler


# Build the sliding-window time-series dataset
def create_dataset(data, n_steps, target_index=0):
    """Slice a 2-D feature table into supervised (window, next value) pairs.

    Sample ``i`` consists of rows ``i .. i + n_steps - 1`` as input and the
    value of column ``target_index`` in row ``i + n_steps`` as target.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        n_steps: Number of consecutive rows in each input window (>= 1).
        target_index: Column to predict. Defaults to 0, the first feature.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        ``(n_rows - n_steps, n_steps, n_features)`` and ``y`` has shape
        ``(n_rows - n_steps,)``. When there are not more rows than
        ``n_steps``, both arrays are empty but ``X`` keeps its 3-D shape so
        callers can still read ``X.shape[2]``.

    Raises:
        ValueError: If ``data`` is not 2-D or ``n_steps`` is smaller than 1.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("data must be 2-D (rows, features)")
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    n_samples = len(data) - n_steps
    if n_samples <= 0:
        # Not enough rows for a single window plus its target.
        empty_X = np.empty((0, n_steps, data.shape[1]), dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + n_steps] for start in range(n_samples)])
    # The target of window ``start`` is row ``start + n_steps``, so all
    # targets together are simply the rows from ``n_steps`` onwards.
    y = data[n_steps:, target_index].copy()
    return X, y


# Inverse-scaling helper
def inverse_scale(scaler, y_values, feature_index=0):
    """Map scaled values of one feature back to that feature's original units.

    The scaler's ``inverse_transform`` expects a full feature matrix, so the
    values are placed into the requested column of a zero-filled matrix, the
    matrix is inverse-transformed, and only that column is returned.

    Args:
        scaler: Fitted scaler exposing ``inverse_transform`` and either
            ``n_features_in_`` or ``feature_names_in_``.
        y_values: Scaled values of a single feature; any array-like, flattened
            to 1-D (so an ``(n, 1)`` prediction array is accepted as well).
        feature_index: Column position of the feature within the data the
            scaler was fitted on. Defaults to 0.

    Returns:
        1-D NumPy array with the values in original units.
    """
    values = np.asarray(y_values, dtype=float).ravel()

    # Prefer n_features_in_: feature_names_in_ is only available when the
    # scaler was fitted on data carrying string column names.
    n_features = getattr(scaler, "n_features_in_", None)
    if n_features is None:
        n_features = len(scaler.feature_names_in_)

    dummy = np.zeros((len(values), n_features))
    dummy[:, feature_index] = values
    return scaler.inverse_transform(dummy)[:, feature_index]


# Main program: preprocess the data, train the LSTM, plot and report results
if __name__ == "__main__":
    # Input file location
    file_path = "LSTM-Multivariate_pollution.csv"
    if not os.path.exists(file_path):
        print(f"文件不存在：{file_path}")
        # Raise SystemExit directly instead of calling exit(): exit() is a
        # helper injected by the site module for interactive sessions, and it
        # reported success (status 0) even though the input file is missing.
        raise SystemExit(1)

    # Data preprocessing
    df_processed, scaler = load_and_preprocess(file_path)
    print("\n处理后的特征维度:", df_processed.shape)
    print("特征列表：", df_processed.columns.tolist())

    # Build the sliding-window dataset: 24 consecutive rows per input sample
    n_steps = 24
    X, y = create_dataset(df_processed.values, n_steps)
    print("\n数据集维度:")
    print("X shape:", X.shape)  # (samples, timesteps, features)
    print("y shape:", y.shape)

    # Chronological split: first 80% of the samples for training, rest for testing
    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Build the LSTM model: two stacked LSTM layers with dropout and a
    # single-unit dense output for the predicted pollution value
    model = Sequential([
        LSTM(100, activation='relu',
             input_shape=(n_steps, X.shape[2]),
             return_sequences=True),  # pass the full sequence on to the next LSTM
        Dropout(0.3),
        LSTM(50, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.summary()

    # Train the model. Early stopping ends training after 5 epochs without
    # improvement; restore_best_weights=True rolls the model back to the best
    # epoch so the evaluation below does not use the weights from the later,
    # non-improving epochs.
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=64,
        validation_split=0.2,
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=1
    )

    # Plot training and validation loss per epoch
    plt.figure(figsize=(8, 4))
    plt.plot(history.history['loss'], label='训练集损失')
    plt.plot(history.history['val_loss'], label='验证集损失')
    plt.title('LSTM模型训练过程损失曲线')
    plt.xlabel('训练轮次（Epoch）')
    plt.ylabel('均方误差（MSE）')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Predict on the held-out test samples
    y_pred = model.predict(X_test)

    # Convert targets and predictions back to original units
    y_test_inv = inverse_scale(scaler, y_test)
    y_pred_inv = inverse_scale(scaler, y_pred.flatten())

    # Plot actual vs. predicted values for the test set
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_inv, label='真实值', alpha=0.7, linewidth=2)
    plt.plot(y_pred_inv, label='预测值', alpha=0.7, linestyle='--')
    plt.title('PM2.5浓度预测结果对比（测试集）')
    plt.xlabel('时间步')
    plt.ylabel('PM2.5浓度 (μg/m³)')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Evaluation metrics, computed in original units
    mse = np.mean((y_test_inv - y_pred_inv) ** 2)
    mae = np.mean(np.abs(y_test_inv - y_pred_inv))
    print("\n=== 模型性能 ===")
    print(f"均方误差（MSE）: {mse:.2f}")
    print(f"平均绝对误差（MAE）: {mae:.2f}")
    print(f"均方根误差（RMSE）: {np.sqrt(mse):.2f}")