毕业论文代码

@author: Rebecca Li

首先引入packages和dataset

import pandas as pd
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

bonds = pd.read_excel('/Users/rebecca/Desktop/Thesis/Data/database_v4.3.xlsx', sheet_name = 'Sheet1')

查看数据表头

bonds.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BondCode	Rank	issueamount	term/year	yield%	Public_Revenue	fa_yoy_or	ROE	EBITDA2D	Current	...	Salescash2OR	OM	EG	CAreturn	Invreturn	LNProvince_GDP	is_gurarantee	LNTotal_Asset	if_public	if_basic
0	078080.IB	I	8.0	10.0	6.08	428.0265	-8.8139	2.9081	0.038838	2.4302	...	98.5065	-70.1820	-56.874827	0.0297	0.0309	8.718091	False	4.202351	True	True
1	122932.SH	I	12.0	7.0	7.00	724.6197	21.7038	2.1190	0.042112	6.8488	...	122.0677	-120.1565	-46.002507	0.0146	0.0158	9.088360	True	4.774094	True	True
2	0980167.IB	I	12.0	7.0	7.00	724.6197	21.7038	2.1190	0.042112	6.8488	...	122.0677	-120.1565	-46.002507	0.0146	0.0158	9.088360	True	4.774094	True	True
3	124307.SH	I	6.0	7.0	6.00	1792.7192	200.5891	17.7061	0.975419	4.1721	...	44.4314	96.1972	272.069235	0.2838	0.1689	9.753365	False	4.325516	True	True
4	1380222.IB	I	6.0	7.0	6.00	1792.7192	200.5891	17.7061	0.975419	4.1721	...	44.4314	96.1972	272.069235	0.2838	0.1689	9.753365	False	4.325516	True	True

5 rows × 24 columns

查看数据集缺失值情况

bonds.isnull().sum(axis=0).sort_values(ascending=False)/float(len(bonds))

if_basic          0.0
if_public         0.0
Rank              0.0
issueamount       0.0
term/year         0.0
yield%            0.0
Public_Revenue    0.0
fa_yoy_or         0.0
ROE               0.0
EBITDA2D          0.0
Current           0.0
D2A               0.0
CF2D              0.0
CF2I              0.0
Earning2A         0.0
Salescash2OR      0.0
OM                0.0
EG                0.0
CAreturn          0.0
Invreturn         0.0
LNProvince_GDP    0.0
is_gurarantee     0.0
LNTotal_Asset     0.0
BondCode          0.0
dtype: float64

查看数据集的维数和数据集的列名

bonds.shape
bonds.columns

Index(['BondCode', 'Rank', 'issueamount', 'term/year', 'yield%',
       'Public_Revenue', 'fa_yoy_or', 'ROE', 'EBITDA2D', 'Current', 'D2A',
       'CF2D', 'CF2I', 'Earning2A', 'Salescash2OR', 'OM', 'EG', 'CAreturn',
       'Invreturn', 'LNProvince_GDP', 'is_gurarantee', 'LNTotal_Asset',
       'if_public', 'if_basic'],
      dtype='object')

# 从列标题中选择特征变量(features)

cols_of_feature = bonds.columns[2:] 
cols_of_feature

Index(['issueamount', 'term/year', 'yield%', 'Public_Revenue', 'fa_yoy_or',
       'ROE', 'EBITDA2D', 'Current', 'D2A', 'CF2D', 'CF2I', 'Earning2A',
       'Salescash2OR', 'OM', 'EG', 'CAreturn', 'Invreturn', 'LNProvince_GDP',
       'is_gurarantee', 'LNTotal_Asset', 'if_public', 'if_basic'],
      dtype='object')

# 目标变量分布(Frequency)可视化
fig, axs = plt.subplots(1,2,figsize=(20,12))

sns.countplot(x='Rank',data=bonds,palette="Set3",ax=axs[0])
axs[0].set_title("Frequency of each Rank")
bonds['Rank'].value_counts().plot(x=None,y=None, kind='pie', ax=axs[1],autopct='%1.2f%%')
axs[1].set_title("Percentage of each Target")
fig.savefig("Frequency & Percentage of Sample Target")
plt.show()

# 查看数据集中各类别的个数
bonds['Rank'].value_counts()

I      15035
II      8979
III       16
Name: Rank, dtype: int64

# 在不同目标变量下，对比指标的相关性，并绘制相关性图谱
RankI = bonds.loc[bonds["Rank"] == "I"] 
RankII = bonds.loc[bonds["Rank"] == "II"] 
RankIII = bonds.loc[bonds["Rank"] == "III"]

# Rank I相关性图谱
correlationRankI = RankI.loc[:, bonds.columns.difference(['Rank', 'BondCode'])].corr()

mask = np.zeros_like(correlationRankI)
indices = np.triu_indices_from(correlationRankI)
mask[indices] = True

with sns.axes_style("white"):
    f, ax1 = plt.subplots(figsize=(15, 12))
    cmap = sns.diverging_palette(220, 8, as_cmap=True)

    ax1 =sns.heatmap(correlationRankI, vmin = -1, vmax = 1, cmap = cmap, square = False, mask = mask, linewidths = 0.1, cbar_kws={'orientation': 'vertical','ticks': [-1, -0.5, 0, 0.5, 1]})
    ax1.set_xticklabels(ax1.get_xticklabels(), size = 8)
    ax1.set_yticklabels(ax1.get_yticklabels(), size = 8)
    ax1.set_title('Rank I Correlation', size = 20)

plt.savefig('Rank I Correlation', bbox_inches='tight') 

#grid_kws = {"width_ratios": (.9, .9, .05), "wspace": 0.2}
# f, (ax1, ax2, cbar_ax) = plt.subplots(1, 3, gridspec_kw=grid_kws, figsize = (14, 9))

correlationRankII = RankII.loc[:, bonds.columns.difference(['Rank', 'BondCode'])].corr()
correlationRankIII = RankIII.loc[:, bonds.columns.difference(['Rank', 'BondCode'])].corr()

mask = np.zeros_like(correlationRankII)
indices = np.triu_indices_from(correlationRankII)
mask[indices] = True
grid_kws = {"width_ratios": (.9, .9, .05), "wspace": 0.2}

f, (ax1, ax2, cbar_ax) = plt.subplots(1, 3, gridspec_kw=grid_kws, figsize = (14, 9))

cmap = sns.diverging_palette(220, 8, as_cmap=True)


ax1 =sns.heatmap(correlationRankII, ax = ax1, vmin = -1, vmax = 1, cmap = cmap, square = False, linewidths = 0.5, mask = mask, cbar = False)
ax1.set_xticklabels(ax1.get_xticklabels(), size = 8)
ax1.set_yticklabels(ax1.get_yticklabels(), size = 8)
ax1.set_title('RankII', size = 20)


ax2 = sns.heatmap(correlationRankIII, vmin = -1, vmax = 1, cmap = cmap, ax = ax2, square = False, linewidths = 0.5, mask = mask, yticklabels = False, cbar_ax = cbar_ax,cbar_kws={'orientation': 'vertical','ticks': [-1, -0.5, 0, 0.5, 1]})
ax2.set_xticklabels(ax2.get_xticklabels(), size = 8)
ax2.set_title('RankIII', size = 20)
cbar_ax.set_yticklabels(cbar_ax.get_yticklabels(), size = 12)
plt.savefig('Rank I & Rank III Feature Correlation', bbox_inches='tight')

指标选取Feature Selection

# Import train_test_split function
from sklearn.model_selection import train_test_split

X=bonds.drop(['Rank', 'BondCode'], axis=1)  # Features
y=bonds[['Rank']] # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # 75% training and 25% test

# Create Random Forest Classifier
clf=RandomForestClassifier(n_estimators = 10,random_state = 123)

# Fitting Original Data
clf.fit(X_train, np.ravel(y_train))

RandomForestClassifier(n_estimators=10, random_state=123)

获取RF模型精确度(Accuracy)

metrics方法获取精确度

#获得测试集预测值
y_pred=clf.predict(X_test)
# Model Accuracy(how often is the classifier correct)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9181091877496671

内置score方法获取精确度

# Get the mean accuracy on the given test data and labels.
print("Accuracy on training set: {:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(X_test, y_test)))

Accuracy on training set: 0.996
Accuracy on test set: 0.918

获取目前指标重要性排序(Identify And Select Most Important Features)

importance = clf.feature_importances_
feature_names = np.array(cols_of_feature)

# for feature in zip(cols_of_feature, clf.feature_importances_):
#     print(feature)

#fe_co = plt.bar(height=importance, x=feature_names)
plt.subplots(figsize = (20,15))
plt.barh(feature_names, importance, align='center') 
plt.yticks(feature_names) 
plt.xlabel("Feature importance")
plt.ylabel("Feature")

plt.savefig('Feature Correlation Original', bbox_inches='tight')

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_names, importance)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: LNTotal_Asset        Importance: 0.22
Variable: yield%               Importance: 0.11
Variable: issueamount          Importance: 0.05
Variable: Current              Importance: 0.05
Variable: is_gurarantee        Importance: 0.05
Variable: Public_Revenue       Importance: 0.04
Variable: Earning2A            Importance: 0.04
Variable: Salescash2OR         Importance: 0.04
Variable: OM                   Importance: 0.04
Variable: CAreturn             Importance: 0.04
Variable: Invreturn            Importance: 0.04
Variable: term/year            Importance: 0.03
Variable: ROE                  Importance: 0.03
Variable: EBITDA2D             Importance: 0.03
Variable: D2A                  Importance: 0.03
Variable: CF2D                 Importance: 0.03
Variable: CF2I                 Importance: 0.03
Variable: EG                   Importance: 0.03
Variable: LNProvince_GDP       Importance: 0.03
Variable: fa_yoy_or            Importance: 0.02
Variable: if_public            Importance: 0.01
Variable: if_basic             Importance: 0.0

由此可知，需剔除两个弱相关(<15%)变量if_public和if_basic

#根据输出的各指标重要性排序结果，剔除重要性最弱的两个变量Q、H
cols_to_drop = ['if_basic','if_public']
X_new = X.drop(cols_to_drop, axis=1)
X_new.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	issueamount	term/year	yield%	Public_Revenue	fa_yoy_or	ROE	EBITDA2D	Current	D2A	CF2D	CF2I	Earning2A	Salescash2OR	OM	EG	CAreturn	Invreturn	LNProvince_GDP	is_gurarantee	LNTotal_Asset
0	8.0	10.0	6.08	428.0265	-8.8139	2.9081	0.038838	2.4302	53.6158	0.0459	11.397205	7.728968	98.5065	-70.1820	-56.874827	0.0297	0.0309	8.718091	False	4.202351
1	12.0	7.0	7.00	724.6197	21.7038	2.1190	0.042112	6.8488	53.4305	0.0222	1.403246	7.144044	122.0677	-120.1565	-46.002507	0.0146	0.0158	9.088360	True	4.774094
2	12.0	7.0	7.00	724.6197	21.7038	2.1190	0.042112	6.8488	53.4305	0.0222	1.403246	7.144044	122.0677	-120.1565	-46.002507	0.0146	0.0158	9.088360	True	4.774094
3	6.0	7.0	6.00	1792.7192	200.5891	17.7061	0.975419	4.1721	39.9249	0.1545	137.334566	20.004795	44.4314	96.1972	272.069235	0.2838	0.1689	9.753365	False	4.325516
4	6.0	7.0	6.00	1792.7192	200.5891	17.7061	0.975419	4.1721	39.9249	0.1545	137.334566	20.004795	44.4314	96.1972	272.069235	0.2838	0.1689	9.753365	False	4.325516

运用新数据构建默认模型(default model RFC)

#默认值查看
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y, test_size=0.25)

clf0 = RandomForestClassifier(oob_score=True, random_state=123)
clf0.fit(X_train_new,np.ravel(y_train_new))
print(clf0.oob_score_)

0.9310842303850849

袋外分数0.9310842303850849，较良好；
通过调参对现有默认模型依据OOB error进行改进，其中改进过称见附录

构建最终模型(Final RFC)

#Final Model: CLF

clf1 = RandomForestClassifier(n_estimators = 503, oob_score=True, random_state=123, max_depth=32)
clf1.fit(X_train_new,np.ravel(y_train_new))
y_pred1=clf1.predict(X_test_new)

获取准确度Accuracy

print("Accuracy:",metrics.accuracy_score(y_test_new, y_pred1))
print("Accuracy on training set: {:.5f}".format(clf1.score(X_train_new, y_train_new)))
print("Accuracy on test set: {:.5f}".format(clf1.score(X_test_new, y_test_new)))

Accuracy: 0.938249001331558
Accuracy on training set: 0.99989
Accuracy on test set: 0.93825

获取OOB Error

#OOB Score indicate whether the model is overfitting
OOB_score = clf1.oob_score_

print("OOB Error: {:.5f}".format(1 - OOB_score))

OOB Error: 0.06331

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test_new, y_pred1))
print(classification_report(y_test_new, y_pred1))

[[3546  177    0]
 [ 190 2090    0]
 [   1    3    1]]
              precision    recall  f1-score   support

           I       0.95      0.95      0.95      3723
          II       0.92      0.92      0.92      2280
         III       1.00      0.20      0.33         5

    accuracy                           0.94      6008
   macro avg       0.96      0.69      0.73      6008
weighted avg       0.94      0.94      0.94      6008

解释：

precision：精度=正确预测的个数(TP)/被预测正确的个数(TP+FP)；也就是模型预测为I/II的值中，有多少是正确的
recall:召回率=正确预测的个数(TP)/预测个数(TP+FN)；也就是对于原值为I/II的值，有多少预测正确了
f1-score:F1 = 2精度召回率/(精度+召回率)
micro avg：计算所有数据下的指标值，假设全部数据 5 个样本中有 3 个预测正确，所以 micro avg 为 3/5=0.6

新模型Feature重要性排序

importance1 = clf1.feature_importances_
cols_of_feature1 = X_new.columns
feature_names1 = np.array(cols_of_feature1)

plt.subplots(figsize = (20,15))
plt.barh(feature_names1, importance1, align='center') 
plt.yticks(feature_names1) 
plt.xlabel("Feature importance for Random Forest")
plt.ylabel("Feature")

plt.savefig('Feature Correlation Final', bbox_inches='tight')

可见，总资产(对数值)和收益率，是否担保及流动比率最为重要

feature_importances1 = [(feature, round(importance1, 2)) for feature, importance1 in zip(feature_names1, importance1)]

# Sort the feature importances by most important first
feature_importances1 = sorted(feature_importances1, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances1];

Variable: LNTotal_Asset        Importance: 0.23
Variable: yield%               Importance: 0.11
Variable: is_gurarantee        Importance: 0.06
Variable: issueamount          Importance: 0.05
Variable: Current              Importance: 0.05
Variable: D2A                  Importance: 0.04
Variable: Earning2A            Importance: 0.04
Variable: CAreturn             Importance: 0.04
Variable: Invreturn            Importance: 0.04
Variable: term/year            Importance: 0.03
Variable: Public_Revenue       Importance: 0.03
Variable: fa_yoy_or            Importance: 0.03
Variable: ROE                  Importance: 0.03
Variable: EBITDA2D             Importance: 0.03
Variable: CF2D                 Importance: 0.03
Variable: CF2I                 Importance: 0.03
Variable: Salescash2OR         Importance: 0.03
Variable: OM                   Importance: 0.03
Variable: EG                   Importance: 0.03
Variable: LNProvince_GDP       Importance: 0.03

附录：调参过称

1. 对n_estimators进行调参

# Range of `n_estimators` values to explore.
min_estimators = 23
max_estimators = 800

error_rate = {n : 0 for n in range(min_estimators, max_estimators + 1, 5)}

clf3 = RandomForestClassifier(oob_score=True, random_state=123)

for i in range(min_estimators, max_estimators + 1, 5):
    clf3.set_params(n_estimators=i)
    clf3.fit(X_train_new,np.ravel(y_train_new))

    # Record the OOB error for each `n_estimators=i` setting.
    oob_error = 1 - clf3.oob_score_
    error_rate[i] = oob_error

plt.plot(list(error_rate.keys()), list(error_rate.values()))

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.show()
plt.savefig('OOB Error Rate for n', bbox_inches='tight')

<Figure size 432x288 with 0 Axes>

#按照error_rate的value进行排序

error_sort = sorted(error_rate.items(), key = lambda kv:(kv[1], kv[0]))
error_sort[:9]
#[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in error_sort]

[(503, 0.06336699589390749),
 (498, 0.06347797136832756),
 (393, 0.06353345910553765),
 (438, 0.06353345910553765),
 (388, 0.06358894684274774),
 (408, 0.06358894684274774),
 (403, 0.06364443457995783),
 (413, 0.06364443457995783),
 (423, 0.06364443457995783)]

n_estimators根据OOB取最小值503

3. 对max_features进行调参

error_rate4 = {n : 0 for n in range(2, 24, 2)}

clf4 = RandomForestClassifier(n_estimators = 503, oob_score=True, max_depth = 32, random_state=123)

for i in range(2, 22, 2):
    clf4.set_params(max_features = i)
    clf4.fit(X_train_new,np.ravel(y_train_new))

    oob_error = 1 - clf4.oob_score_
    error_rate4[i] = oob_error
    
    print('Counting on:', i)

Counting on: 2
Counting on: 4
Counting on: 6
Counting on: 8
Counting on: 10
Counting on: 12
Counting on: 14
Counting on: 16
Counting on: 18
Counting on: 20

plt.plot(list(error_rate4.keys()), list(error_rate4.values()))

plt.xlim(2, 20)
plt.xlabel("max_features")
plt.ylabel("OOB error rate")
plt.show()
plt.savefig('OOB Error Rate for features', bbox_inches='tight')

<Figure size 432x288 with 0 Axes>

error_sort4 = sorted(error_rate4.items(), key = lambda kv:(kv[1], kv[0]))
error_sort4[:19]

[(22, 0),
 (2, 0.061757851514815276),
 (4, 0.0633115081566974),
 (6, 0.06508711574741977),
 (10, 0.06591943180557092),
 (8, 0.0660304072799911),
 (14, 0.06625235822883146),
 (12, 0.06630784596604156),
 (18, 0.0665852846520919),
 (16, 0.06680723560093216),
 (20, 0.06780601487071358)]

max_features取合理区间$[4,6]$OOB最小时值为4,近似sqrt(20),因此最终模型取默认值

2.对决策树最大深度max_depth进行调参

若等于None,表示决策树在构建最优模型的时候不会限制子树的深度。如果模型样本量多，特征也多的情况下，推荐限制最大深度；若样本量少或者特征少，则不限制最大深度。

error_rate5 = {n : 0 for n in range(4,40,2)}

clf5 = RandomForestClassifier(n_estimators = 503, oob_score=True, random_state=123)

for i in range(4,40,2):
    clf5.set_params(max_depth = i)
    clf5.fit(X_train_new,np.ravel(y_train_new))

    oob_error = 1 - clf5.oob_score_
    error_rate5[i] = oob_error

plt.plot(list(error_rate5.keys()), list(error_rate5.values()))

plt.xlim(4,40,2)
plt.xlabel("max_depth")
plt.ylabel("OOB error rate")
plt.show()
plt.savefig('OOB Error Rate for depth', bbox_inches='tight')

<Figure size 432x288 with 0 Axes>

error_sort5 = sorted(error_rate5.items(), key = lambda kv:(kv[1], kv[0]))
error_sort5[:40]

[(32, 0.0633115081566974),
 (34, 0.0633115081566974),
 (36, 0.06336699589390749),
 (38, 0.06336699589390749),
 (30, 0.06369992231716792),
 (28, 0.06419931195205864),
 (20, 0.06425479968926873),
 (26, 0.06431028742647871),
 (22, 0.06447675063810898),
 (24, 0.06464321384973926),
 (18, 0.06569748085673066),
 (16, 0.0699145488846965),
 (14, 0.07690600377316614),
 (12, 0.08872489179891241),
 (10, 0.10858950172011983),
 (8, 0.12967484185994893),
 (6, 0.14942847630673617),
 (4, 0.17045832870935518)]

max_depth取OOB最小时值32

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

毕业论文代码

@author: Rebecca Li

指标选取Feature Selection

运用新数据构建默认模型(default model RFC)

构建最终模型(Final RFC)

附录：调参过称

1. 对n_estimators进行调参

3. 对max_features进行调参

2.对决策树最大深度max_depth进行调参

FilesExpand file tree

RF_Code_v1.md

Latest commit

History

RF_Code_v1.md

File metadata and controls

毕业论文代码

@author: Rebecca Li

指标选取Feature Selection

运用新数据构建默认模型(default model RFC)

构建最终模型(Final RFC)

附录：调参过称

1. 对n_estimators进行调参

3. 对max_features进行调参

2.对决策树最大深度max_depth进行调参