diff --git a/docs/zh/examples/REDD.md b/docs/zh/examples/REDD.md new file mode 100644 index 000000000..fe875e7ef --- /dev/null +++ b/docs/zh/examples/REDD.md @@ -0,0 +1,157 @@ +# 企业碳排放等级分类模型 Classification model for carbon emission levels of enterprises + +## 背景简介 + +本项目基于 PaddlePaddle 构建了一个企业碳排放等级分类模型,融合了遥感卫星观测、气象信息与企业地理属性数据,使用加权 FocalLoss 抵抗类别不平衡,并引入 Dropout 机制与多层感知器结构提升模型表达能力和泛化能力。模型支持对企业在不同空间、时间、环境条件下的 CO₂ 排放等级进行预测,并提供训练过程多维指标可视化,适用于碳排放配额管理、碳达峰路线推演、排放结构治理等典型应用场景。经训练本模型分类的准确率可达75%+ + +本项目具备以下特性: +- 融合卫星反演数据、地面排放数据与风场特征进行多源建模; +- 采用分层抽样保证训练集分布均衡,增强泛化能力; +- 使用加权 FocalLoss 加强对中高、高等级的学习; +- 引入梯度范数等训练指标进行稳定性监控; +- 支持分类性能多角度可视化展示与结果导出。 + +## 企业碳排放等级分类模型原理说明 + +本模型旨在利用多源融合数据,构建对企业单位时间内 CO₂ 排放等级的智能分类器,支撑精准监管与排放评估。 + +--- + +## 🌐 模型理论基础 + +### ✅ 多因子驱动假设 +企业碳排放受多重因素影响,包括: +- 空间位置(纬度、经度) +- 气象条件(风向、风速) +- 时序因素(小时、月份) +- 卫星遥感观测值(xco₂) + +### ✅ 分级建模思路 +将连续型 CO₂ 排放值通过区间划分,映射为四类等级,实现多分类问题建模。 + +### ✅ 不平衡处理原则 +使用 FocalLoss 及类别权重策略,提升对中高、高等级样本的辨识能力,抑制主导类干扰。 + +### ✅ 时空特征建模思想 +引入企业的空间位置与采样时间,提取 `hour` 与 `month` 信息,捕捉排放时空异质性。 + +--- + +## 📊 字段与特征说明 + +| 字段名 | 含义 | 类型 | 说明 | +|------------------|--------------|--------|------------------------------------| +| 企业CO₂排放量 (kg) | 企业碳排放值 | 数值型 | 模型目标分类依据(4类) | +| 卫星CO₂浓度 (xco2) | 卫星观测值 | 数值型 | 区域背景 CO₂ 浓度参考 | +| 风向、风速 | 气象参数 | 数值型 | 空间传输扩散影响因子 | +| 纬度、经度 | 企业地理位置 | 数值型 | 区域空间特征 | +| 企业省份 | 所属行政区域 | 类别型 | OneHot 编码用于嵌入处理 | +| 匹配时间 | 数据采集时间 | 时间型 | 提取 hour 与 month 作为时间特征 | + +--- + +## 🏷 标签分类规则 + +| CO₂ 排放范围 (kg) | 等级标签 | +|------------------|--------| +| < 1500 | 低 (0) | +| 1500 – 7800 | 中低 (1) | +| 7800 – 40000 | 中高 (2) | +| ≥ 40000 | 高 (3) | + +--- +## 模型构建 +本模型采用经典多层感知器(MLP)结构,适用于低维、融合型特征构建分类任务。其构建流程如下: + +### 🔍 特征预处理 + +- **数值特征**:使用 `StandardScaler` 标准化 +- **类别特征**:使用 `OneHotEncoder` 编码 +- **整合方式**:通过 `ColumnTransformer` 管道统一预处理流程 + +### 🏗 模型结构设计(MLP) + +```text +输入特征 → Linear(256) → ReLU → Dropout(0.3) + → Linear(128) → ReLU → Dropout(0.2) + → Linear(64) → ReLU + → Linear(32) → ReLU + → Linear(4) → Softmax 输出分类概率 +``` + +## 训练与评估 + + +### 数据划分策略 + +- **方法**:`StratifiedShuffleSplit` +- **目的**:保持各类别比例稳定,避免训练偏斜 + +### 训练参数设置 + +- Epoch 数:500 +- 批量训练:全量训练(后续可扩展 mini-batch) +- 学习率:0.001(Adam 优化) +- 验证频率:每 20 轮评估一次 + +### 核心监控指标 + +- 📈 **Loss 曲线**:训练损失随 epoch 变化趋势 +- ✅ **Accuracy 曲线**:验证集分类准确率 +- 📊 **Confusion Matrix**:评估误判情况 +- 🔍 **Gradient Norm**:追踪每轮梯度大小监控训练稳定性 +- 📉 **各类 Recall 曲线**:检测模型对不同等级的学习表现 + +--- + +## 📌 模型评价指标 + +| 指标 | 含义 | +|-----------|--------------------------| +| Accuracy | 全部预测样本中正确分类的比例 | +| Precision | 各类别预测为正样本中正确的比例 | +| Recall | 各类别中被成功预测出的比例 | +| F1-score | Precision 与 Recall 的调和平均 | +| 混淆矩阵 | 观测各类别间的预测错配情况 | + +--- + +## 结果可视化 + +训练过程展示以下图表: +- 📉 Loss 与 Accuracy 曲线 +![Loss](https://www.craes-air.cn/official/REDD_Loss.png) + +- 📊 Recall 柱状图 +![Recall](https://www.craes-air.cn/official/REDD_Recall.png) + +- 混淆矩阵热图 +![混淆矩阵热图](https://www.craes-air.cn/official/REDD_Confusion.png) + + +## 结果展示 + +示例输出格式如下: + +```csv +企业名称,实际等级,预测等级 +企业A,2,2 +企业B,3,2 +企业C,1,1 +``` + +## 完整代码 + +确保数据文件在当前目录后运行: + +``` py linenums="1" title="examples/REDD/REDD.py" +--8<-- +examples/REDD/REDD.py +--8<-- +``` + + +## 参考资料 + +- https://github.com/PaddlePaddle/PaddleSlim +- https://scikit-learn.org/stable/modules/classes.html diff --git a/examples/REDD/20240101_20240301_meteo_data.csv b/examples/REDD/20240101_20240301_meteo_data.csv new file mode 100644 index 000000000..450789ffd --- /dev/null +++ b/examples/REDD/20240101_20240301_meteo_data.csv @@ -0,0 +1,8 @@ +stationIdC,year,mon,day,hour,prs,winDAvg2mi,winSAvg2mi,tem,rhu,pre3h,date,city,station 
+53392,2024,1,1,0,857.8,335,0.3,-23.7,78,0,2024/1/1 0:00,żҿ, +53392,2024,1,1,3,857.6,175,0.6,-12.1,85,0,2024/1/1 3:00,żҿ, +53392,2024,1,1,6,855.9,217,1.4,-8.8,67,0,2024/1/1 6:00,żҿ, +53392,2024,1,1,9,856.1,276,0.6,-12.3,72,0,2024/1/1 9:00,żҿ, +53392,2024,1,1,12,856.4,147,1.4,-17.8,85,0,2024/1/1 12:00,żҿ, +53392,2024,1,1,15,856.2,256,0.9,-17.4,84,0,2024/1/1 15:00,żҿ, +53392,2024,1,1,18,856.1,245,1.3,-18.4,86,0,2024/1/1 18:00,żҿ, diff --git a/examples/REDD/20240101_data.xlsx b/examples/REDD/20240101_data.xlsx new file mode 100644 index 000000000..4f9d2f284 Binary files /dev/null and b/examples/REDD/20240101_data.xlsx differ diff --git a/examples/REDD/Fusion_Data.xlsx b/examples/REDD/Fusion_Data.xlsx new file mode 100644 index 000000000..086203378 Binary files /dev/null and b/examples/REDD/Fusion_Data.xlsx differ diff --git a/examples/REDD/REDD.py b/examples/REDD/REDD.py new file mode 100644 index 000000000..97fdc8a59 --- /dev/null +++ b/examples/REDD/REDD.py @@ -0,0 +1,201 @@ +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.metrics import ConfusionMatrixDisplay +from sklearn.metrics import accuracy_score +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler + +matplotlib.rcParams["font.family"] = "SimHei" +matplotlib.rcParams["axes.unicode_minus"] = False + +# ==== Parameter Configuration (Customizable) ==== +EPOCHS = 500 +LR = 0.01 +CLASS_WEIGHTS = [3.5, 3.5, 2.0, 2.5] # Class order: Low, Mid-Low, Mid-High, High + +# ==== Classification Label Function ==== +def classify_emission(value): + if value < 1500: + return 0 + elif value < 7800: + return 1 + elif value < 40000: + return 2 + else: + return 3 + + +# ==== FocalLoss Definition ==== +class FocalLoss(nn.Layer): + def __init__(self, gamma=2, weight=None): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.weight = weight + + def forward(self, input, target): + logpt = F.cross_entropy(input, target, weight=self.weight, reduction="none") + pt = paddle.exp(-logpt) + loss = ((1 - pt) ** self.gamma) * logpt + return loss.mean() + + +# ==== Load and Clean Data ==== +df = pd.read_excel("./Fusion_Data.xlsx") +df = df.dropna( + subset=[ + "企业CO₂排放量 (kg)", + "匹配时间", + "企业省份", + "卫星中心纬度", + "卫星中心经度", + "卫星CO₂浓度 (xco2)", + "风向", + "风速", + ] +) +df["匹配时间"] = pd.to_datetime(df["匹配时间"]) +df["hour"] = df["匹配时间"].dt.hour +df["month"] = df["匹配时间"].dt.month + +# ==== Feature Processing ==== +numeric_features = ["卫星中心纬度", "卫星中心经度", "卫星CO₂浓度 (xco2)", "风向", "风速", "hour", "month"] +categorical_features = ["企业省份"] +X_raw = df[numeric_features + categorical_features] +y_raw = df["企业CO₂排放量 (kg)"].values.reshape(-1, 1) +labels = np.vectorize(classify_emission)(y_raw.flatten()) +enterprise_names = df["企业名称"].values + +ct = ColumnTransformer( + [ + ("num", StandardScaler(), numeric_features), + ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features), + ] +) +X = ct.fit_transform(X_raw) + +# ==== Stratified Sampling ==== +sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) +for train_idx, test_idx in sss.split(X, labels): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = labels[train_idx], labels[test_idx] + name_train, name_test = 
enterprise_names[train_idx], enterprise_names[test_idx] + +X_train = paddle.to_tensor(X_train, dtype="float32") +y_train = paddle.to_tensor(y_train, dtype="int64") +X_test = paddle.to_tensor(X_test, dtype="float32") +y_test = paddle.to_tensor(y_test, dtype="int64") + +# ==== Network Architecture ==== +class EmissionClassifier(nn.Layer): + def __init__(self, input_dim): + super().__init__() + self.shared = nn.Sequential( + nn.Linear(input_dim, 256), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(256, 128), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.ReLU(), + ) + self.classifier = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 4)) + + def forward(self, x): + x = self.shared(x) + return self.classifier(x) + + +# ==== Model Training ==== +model = EmissionClassifier(input_dim=X.shape[1]) +optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=LR) +loss_fn = FocalLoss(gamma=2, weight=paddle.to_tensor(CLASS_WEIGHTS, dtype="float32")) + +train_loss_record = [] +val_acc_record = [] + +for epoch in range(EPOCHS): + model.train() + logits = model(X_train) + loss = loss_fn(logits, y_train) + loss.backward() + optimizer.step() + optimizer.clear_grad() + train_loss_record.append(loss.numpy()) + + if (epoch + 1) % 20 == 0: + model.eval() + with paddle.no_grad(): + val_logits = model(X_test) + preds = paddle.argmax(val_logits, axis=1) + acc = accuracy_score(y_test.numpy(), preds.numpy()) + val_acc_record.append(acc) + print(f"[Epoch {epoch+1}] loss={loss.numpy():.4f}, acc={acc:.4f}") + +# ==== Model Evaluation ==== +model.eval() +X_all_tensor = paddle.to_tensor(X, dtype="float32") +with paddle.no_grad(): + preds = paddle.argmax(model(X_all_tensor), axis=1).numpy() + +print("\n🎯 Overall Accuracy: {:.2f}%".format(accuracy_score(labels, preds) * 100)) +print("\n📊 Classification Report:") +report = classification_report( + labels, preds, target_names=["Low", "Mid-Low", "Mid-High", "High"], output_dict=True +) +print( + classification_report( + labels, preds, target_names=["Low", "Mid-Low", "Mid-High", "High"] + ) +) + +# ==== 📈 Training Loss Curve ==== +plt.figure() +plt.plot(train_loss_record, label="Training Loss") +plt.title("Training Loss Curve") +plt.xlabel("Epoch") +plt.ylabel("Loss") +plt.grid(True) +plt.legend() +plt.tight_layout() +plt.show() + +# ==== 📊 Recall per Class Bar Chart ==== +plt.figure() +target_names = ["Low", "Mid-Low", "Mid-High", "High"] +recalls = [report[name]["recall"] for name in target_names] +plt.bar(target_names, recalls) +plt.title("Recall per Class") +plt.ylabel("Recall") +plt.ylim(0, 1) +plt.grid(axis="y") +plt.tight_layout() +plt.show() + +# ==== Confusion Matrix ==== +cm = confusion_matrix(labels, preds) +ConfusionMatrixDisplay( + confusion_matrix=cm, display_labels=["Low", "Mid-Low", "Mid-High", "High"] +).plot(cmap="Blues") +plt.title("Predicted vs Actual Class") +plt.tight_layout() +plt.show() + +# ==== Export Results ==== +pd.DataFrame( + { + "Enterprise Name": enterprise_names, + "Actual Class": labels, + "Predicted Class": preds, + } +).to_csv("carbon_emission_prediction_results.csv", index=False) +print("✅ Results exported to: carbon_emission_prediction_results.csv") diff --git a/examples/REDD/REDD_data.py b/examples/REDD/REDD_data.py new file mode 100644 index 000000000..bd295bcf3 --- /dev/null +++ b/examples/REDD/REDD_data.py @@ -0,0 +1,197 @@ +import numpy as np +import pandas as pd +import xarray as xr +from scipy.spatial import cKDTree + + +# 1. 
Load NetCDF satellite file +def load_satellite_data(file_path): + ds = xr.open_dataset(file_path) + df_all = pd.DataFrame( + { + "latitude": ds["latitude"].values, + "longitude": ds["longitude"].values, + "xco2": ds["xco2"].values, + "time": ds["time"].values, + } + ) + return df_all + + +# 2. Filter data within China region +def filter_china(df_all): + return df_all[ + (df_all["latitude"] >= 18) + & (df_all["latitude"] <= 54) + & (df_all["longitude"] >= 73) + & (df_all["longitude"] <= 135) + ] + + +# 3. Clean enterprise data +def load_enterprise_data(file_path): + df = pd.read_excel(file_path) + df = df.dropna(subset=["纬度", "经度"]) + df = df[np.isfinite(df["纬度"]) & np.isfinite(df["经度"])] + return df + + +# 4. Build KDTree for enterprise coordinates +def create_enterprise_tree(df): + enterprise_coords = df[["纬度", "经度"]].to_numpy() + return cKDTree(enterprise_coords) + + +# 5. Perform spatial matching between satellite points and enterprises +def match_satellite_to_enterprise(satellite_df, enterprise_tree, search_radius): + satellite_coords = satellite_df[["latitude", "longitude"]].to_numpy() + matches = enterprise_tree.query_ball_point(satellite_coords, r=search_radius) + return matches + + +# 6. Construct detailed matching results +def build_match_results(matches, satellite_df, enterprise_df): + detailed_results = [] + for i, matched_indices in enumerate(matches): + if matched_indices: + sat_row = satellite_df.iloc[i] + for idx in matched_indices: + ent_row = enterprise_df.iloc[idx] + detailed_results.append( + { + "Satellite Latitude": sat_row["latitude"], + "Satellite Longitude": sat_row["longitude"], + "Satellite CO₂ Concentration (xco2)": sat_row["xco2"], + "Satellite Observation Time": sat_row["time"], + "Province": ent_row.get("省份", None), + "City": ent_row.get("城市", None), + "Enterprise Name": ent_row.get("企业名称", None), + "Emission Point Name": ent_row.get("排放口名称", None), + "Enterprise Longitude": ent_row.get("经度", None), + "Enterprise Latitude": ent_row.get("纬度", None), + "Enterprise Time": ent_row.get("时间", None), + "Enterprise CO₂ Emission (kg)": ent_row.get( + "二氧化碳排放量(kg)", None + ), + } + ) + return pd.DataFrame(detailed_results) + + +# 7. Aggregate data (remove duplicates and merge) +def aggregate_data(df): + group_keys = [ + "Enterprise Name", + "Match Time", + "Satellite Latitude", + "Satellite Longitude", + ] + aggregated_df = df.groupby(group_keys, as_index=False).agg( + { + "Enterprise CO₂ Emission (kg)": "sum", + "Satellite CO₂ Concentration (xco2)": "first", + "Satellite Observation Time": "first", + "Province": "first", + "City": "first", + "Enterprise Longitude": "first", + "Enterprise Latitude": "first", + "Enterprise Time": "first", + } + ) + return aggregated_df + + +# 8. Remove duplicate rows +def deduplicate_data(df): + return df.drop_duplicates(subset=["Enterprise Name", "Match Time"], keep="first") + + +# 9. Load weather data +def load_weather_data(file_path): + return pd.read_csv(file_path) + + +# 10. Preprocess weather data +def preprocess_weather_data(df_meteo): + df_meteo["date"] = pd.to_datetime(df_meteo["date"]) + df_meteo["Standardized City"] = df_meteo["city"].str.replace("市", "", regex=False) + return df_meteo + + +# 11. 
Match weather data by time and city +def smart_city_match(row, meteo): + time = row["Match Time"] + city = row["Standardized City"] + + if city in ["辖区", "市辖区", "县"]: + city = row["Province"].replace("省", "").replace("市", "") + + candidates = meteo[ + (meteo["Standardized City"] == city) + & (meteo["date"] >= time - pd.Timedelta(hours=2)) + & (meteo["date"] <= time + pd.Timedelta(hours=2)) + ] + + if not candidates.empty: + return candidates.sort_values("date").iloc[0][ + ["prs", "winDAvg2mi", "winSAvg2mi", "tem", "rhu", "pre3h"] + ] + else: + return pd.Series( + [None] * 6, index=["prs", "winDAvg2mi", "winSAvg2mi", "tem", "rhu", "pre3h"] + ) + + +# 12. Merge weather data with enterprise-satellite matched data +def merge_weather_data(df_enterprise, df_meteo): + matched_weather = df_enterprise.apply( + lambda row: smart_city_match(row, df_meteo), axis=1 + ) + return pd.concat([df_enterprise, matched_weather], axis=1) + + +# === Main Process === +satellite_file_path = ( + "https://disc.gsfc.nasa.gov/datasets?keywords=oco2&page=1" # nasa data +) +enterprise_file_path = "./20240101_data.xlsx" +weather_file_path = "./20240101_20240301_meteo_data.csv" + +# Load and filter satellite data +satellite_df = load_satellite_data(satellite_file_path) +satellite_df_china = filter_china(satellite_df) + +# Load enterprise data +enterprise_df = load_enterprise_data(enterprise_file_path) + +# Build KDTree for spatial search +enterprise_tree = create_enterprise_tree(enterprise_df) + +# Define matching radius (approx. pixel size) +lat_half = 2.25 / 111 / 2 +lon_half = 1.29 / 111 / 2 +search_radius = max(lat_half, lon_half) + +# Match satellite pixels to nearby enterprises +matches = match_satellite_to_enterprise( + satellite_df_china, enterprise_tree, search_radius +) + +# Build detailed match results +detailed_df = build_match_results(matches, satellite_df_china, enterprise_df) + +# Aggregate matched data +aggregated_df = aggregate_data(detailed_df) + +# Deduplicate entries +deduplicated_df = deduplicate_data(aggregated_df) + +# Load and process meteorological data +df_meteo = load_weather_data(weather_file_path) +df_meteo = preprocess_weather_data(df_meteo) + +# Merge meteorological data with matched results +final_df = merge_weather_data(deduplicated_df, df_meteo) + +# Save final merged dataset +final_df.to_excel("merged_result.xlsx", index=False)
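REDD.md above notes that REDD.py currently trains on the full batch and "could later be extended to mini-batch" training. The snippet below is a minimal, hedged sketch of one way to do that with `paddle.io.DataLoader`; it assumes the `X_train`, `y_train`, `model`, `loss_fn`, `optimizer`, and `EPOCHS` objects already defined in REDD.py are in scope, and the batch size of 64 is an illustrative choice, not a value taken from the original script.

```py
from paddle.io import DataLoader, TensorDataset

# Wrap the existing training tensors from REDD.py into a dataset and iterate in mini-batches.
train_loader = DataLoader(
    TensorDataset([X_train, y_train]), batch_size=64, shuffle=True
)

for epoch in range(EPOCHS):
    model.train()
    for batch_x, batch_y in train_loader:
        logits = model(batch_x)
        loss = loss_fn(logits, batch_y)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
```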
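REDD.md also lists gradient-norm tracking among the training-stability metrics, but the committed REDD.py does not record it. Below is a minimal sketch of how such logging could be added to the existing Paddle training loop; `global_grad_norm` is a hypothetical helper introduced here for illustration, and `model` refers to the `EmissionClassifier` instance from REDD.py.

```py
import paddle


def global_grad_norm(model: paddle.nn.Layer) -> float:
    """Global L2 norm of all parameter gradients; call after loss.backward()."""
    total = 0.0
    for param in model.parameters():
        if param.grad is not None:
            total += float(paddle.sum(param.grad * param.grad))
    return total ** 0.5


# In REDD.py's training loop, after loss.backward() and before optimizer.step():
# grad_norm_record.append(global_grad_norm(model))
```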