| 1 | # Import the required libraries | 
| 2 | import pandas as pd | 
| 3 | from sklearn.decomposition import PCA | 
| 4 | from sklearn.model_selection import train_test_split | 
| 5 | from sklearn.tree import DecisionTreeClassifier | 
| 6 | import numpy as np | 
| 7 | import seaborn as sns | 
| 8 | import matplotlib.pyplot as plt | 
| 9 | from sklearn import metrics | 
| 10 | from sklearn.metrics import roc_curve, auc | 
| 11 |  | 
def Read_data(file):
    """Load a CSV into a DataFrame and rename its columns.

    The file is read with ``pd.read_csv`` and its columns are replaced, in
    order, by the 14 descriptive names listed below (pandas raises if the
    file does not have exactly 14 columns).

    Side effects: several global pandas display options are changed so that
    frames print without truncation, and the first rows are printed.

    Args:
        file: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame with renamed columns.
    """
    column_names = [
        'age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
        'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
        'max_heart_rate_achieved', 'exercise_induced_angina',
        'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia',
        'target',
    ]
    frame = pd.read_csv(file)
    frame.columns = column_names

    # Global display settings: no row/column/width truncation, and treat
    # ambiguous / East Asian characters as wide when aligning output.
    display_options = {
        'display.max_rows': None,
        'display.max_columns': None,
        'display.width': None,
        'display.unicode.ambiguous_as_wide': True,
        'display.unicode.east_asian_width': True,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

    print(frame.head())
    return frame
| 25 |  | 
| 26 | # =================== Data cleaning ====================== | 
def data_clean(data):
    """Remove duplicate/incomplete rows and cap blood-pressure outliers.

    Steps, each reported with a printed message:
      1. Duplicate rows are detected and dropped.
      2. Rows containing any missing value are detected and dropped.
      3. 'resting_blood_pressure' outliers are reported by the 2-sigma rule
         and by the 1.5*IQR box-plot rule; values outside the box-plot
         fences are replaced by the nearest value inside the fences.

    The input frame is not modified; a cleaned frame with a fresh
    0..n-1 index is returned.

    Args:
        data: DataFrame containing a 'resting_blood_pressure' column.

    Returns:
        The cleaned DataFrame.
    """
    # --- Duplicates ---
    print('存在' if data.duplicated().any() else '不存在', '重復(fù)觀測(cè)值')
    # drop_duplicates returns a new frame; the result must be kept.
    data = data.drop_duplicates()

    # --- Missing values ---
    # .values.any() inspects the cells; iterating a DataFrame would only
    # walk the column labels and always look "truthy".
    print('存在' if data.isnull().values.any() else '不存在', '缺失值')
    # Strategy used: delete incomplete records. (Forward/backward fill or
    # filling with a constant/column mean are alternatives, but they are
    # moot once the rows are removed.)
    # The index is rebuilt so later positional code sees 0..n-1 labels.
    data = data.dropna().reset_index(drop=True)

    # --- Outliers in resting blood pressure ---
    column = 'resting_blood_pressure'
    pressure = data[column]

    # Standard-deviation check (report only).
    xmean = pressure.mean()
    xstd = pressure.std()
    print('存在' if (pressure > xmean + 2 * xstd).any() else '不存在', '上限異常值')
    print('存在' if (pressure < xmean - 2 * xstd).any() else '不存在', '下限異常值')

    # Box-plot (IQR) check.
    q1 = pressure.quantile(0.25)
    q3 = pressure.quantile(0.75)
    up = q3 + 1.5 * (q3 - q1)
    dw = q1 - 1.5 * (q3 - q1)
    print('存在' if (pressure > up).any() else '不存在', '上限異常值')
    print('存在' if (pressure < dw).any() else '不存在', '下限異常值')

    # Cap outliers at the most extreme value strictly inside each fence and
    # write the column back explicitly (no chained assignment on a slice).
    upper_cap = pressure[pressure < up].max()
    lower_cap = pressure[pressure > dw].min()
    data[column] = pressure.mask(pressure > up, upper_cap).mask(pressure < dw, lower_cap)
    return data
| 60 |  | 
| 61 |  | 
| 62 | #=========== Binned statistics for numeric variables; grouped statistics for discrete variables ============== | 
def Segment_statistics(data):
    """Print per-group counts for (chest_pain_type, sex).

    An age-band column is also derived by cutting 'age' into 10-year bins
    from 20 to 110, but that frame is only built, never printed or
    returned.

    Args:
        data: DataFrame with 'age', 'chest_pain_type' and 'sex' columns.

    Returns:
        None.
    """
    age_frame = data[['age']]
    bin_edges = list(range(20, 111, 10))  # 20, 30, ..., 110
    age_bands = pd.cut(age_frame.values.flatten(), bins=bin_edges)
    band_frame = pd.DataFrame(age_bands, columns=['年齡段'])
    # Age next to its band; kept for inspection only (not reported).
    age_with_band = pd.concat([age_frame, band_frame], axis=1)

    grouped = data.groupby(['chest_pain_type', 'sex'])
    print(grouped.count())
    return None
| 75 |  | 
| 76 |  | 
| 77 | #======================== Data encoding =========================== | 
def data_encoding(data):
    """One-hot encode the discrete features and z-score the continuous ones.

    Only the 14 known columns are kept. Discrete columns are expanded with
    ``pd.get_dummies``; continuous columns are centred on their mean and
    divided by their sample standard deviation. The 'target' column passes
    through unchanged.

    A continuous column whose standard deviation is zero or undefined
    (constant column, single-row frame) is only centred, so it becomes all
    zeros instead of NaN.

    Args:
        data: DataFrame containing the 14 named columns.

    Returns:
        A new DataFrame with encoded/scaled features plus 'target'.
    """
    discrete_features = [
        'sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
        'exercise_induced_angina', 'st_slope', 'thalassemia',
    ]
    continuous_features = [
        'age', 'resting_blood_pressure', 'cholesterol',
        'max_heart_rate_achieved', 'st_depression', 'num_major_vessels',
    ]
    selected = data[[
        'age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
        'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
        'max_heart_rate_achieved', 'exercise_induced_angina',
        'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia',
        'target',
    ]]

    # 'target' is not in the dummy list, so get_dummies carries it through.
    df = pd.get_dummies(selected, columns=discrete_features)

    mean = df[continuous_features].mean()
    std = df[continuous_features].std()
    # Replace zero/NaN spreads with 1 to avoid dividing by zero.
    std = std.where(std > 0, 1.0)
    df[continuous_features] = (df[continuous_features] - mean) / std
    return df
| 92 |  | 
def PCA_analysis(data):
    """Project the features onto two principal components and plot them.

    All columns except 'target' are reduced with a 2-component PCA. Rows
    whose target equals 1 are drawn as red circles ('Yes'), rows whose
    target equals 0 as blue crosses ('No'); any other target value is not
    plotted. After the figure is shown, the explained-variance ratio of the
    two components is printed.

    Args:
        data: DataFrame with numeric feature columns and a 'target' column.

    Returns:
        None.
    """
    # X holds the feature columns; labels holds the target values.
    X = data.drop('target', axis=1)
    # Plain array: selection below is positional and therefore independent
    # of whatever index labels the frame carries.
    labels = data['target'].to_numpy()

    pca = PCA(n_components=2)
    reduced_x = pca.fit_transform(X)  # data reduced to 2 dimensions

    is_yes = labels == 1
    is_no = labels == 0

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')
    plt.scatter(reduced_x[is_yes, 0], reduced_x[is_yes, 1],
                c='r', marker='o', label='Yes')
    plt.scatter(reduced_x[is_no, 0], reduced_x[is_no, 1],
                c='b', marker='x', label='No')
    plt.title('PCA analysis')
    plt.legend()
    plt.show()
    print(pca.explained_variance_ratio_)  # contribution of each component
| 124 |  | 
| 125 |  | 
def data_partition(data):
    """Split the frame into train/test feature and label sets.

    Prints the class counts of 'target' (to show class balance), then
    performs an 80/20 split with ``random_state=10``.

    Args:
        data: DataFrame with feature columns and a 'target' column.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, feature)`` where
        ``feature`` is the list of feature column names. Note the order:
        both training parts come before both test parts.
    """
    # Class balance check.
    print(data['target'].value_counts())

    labels = data['target']
    features = data.drop(columns='target')

    split = train_test_split(features, labels, test_size=0.2, random_state=10)
    X_train, X_test, y_train, y_test = split

    feature = features.columns.tolist()
    return X_train, y_train, X_test, y_test, feature
| 136 |  | 
| 137 |  | 
def Draw_ROC(list1, list2):
    """Plot a ROC curve with its AUC in the legend.

    Args:
        list1: True labels, passed to ``roc_curve`` as ``y_true``
            (label 1 is treated as the positive class).
        list2: Scores for the positive class, passed as ``y_score``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fpr_model, tpr_model, _ = roc_curve(list1, list2, pos_label=1)
    roc_auc_model = auc(fpr_model, tpr_model)

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.plot(fpr_model, tpr_model, 'blue', label='AUC = %0.2f' % roc_auc_model)
    plt.legend(loc='lower right', fontsize=12)
    # Diagonal reference line (unlabelled, so it stays out of the legend).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.show()
    return
| 155 |  | 
| 156 | #============ Decision tree ===================== | 
def DT(X_train, y_train, X_test, y_test, feature):
    """Fit a depth-5 decision tree, report its quality and plot importances.

    Prints train/test accuracy, the number of correct test predictions, and
    a classification report plus confusion matrix for both the test and the
    training set. Then draws a horizontal bar chart of every feature with a
    non-zero importance, sorted from most to least important.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        feature: Feature names, in the same order as the columns of
            ``X_train``.

    Returns:
        Tuple ``(list(y_test), probabilities)`` where ``probabilities`` is
        the second column of ``predict_proba(X_test)`` as a list
        (presumably the probability of class 1 - assumes the labels are
        0/1 and both occur in ``y_train``).
    """
    tree1 = DecisionTreeClassifier(max_depth=5, random_state=0)
    tree1.fit(X_train, y_train)
    print('\nFinally results of decision tree fitting:')
    print('Accuracy on training set: {:.3f}'.format(tree1.score(X_train, y_train)))
    print('Accuracy on test set: {:.3f}'.format(tree1.score(X_test, y_test)))

    predict_target = tree1.predict(X_test)
    predict_target_prob_dt = tree1.predict_proba(X_test)[:, 1]

    print('預(yù)測(cè)正確的個(gè)數(shù):', sum(predict_target == y_test))
    print('DT驗(yàn)證集報(bào)告:')
    print(metrics.classification_report(y_test, predict_target))
    print(metrics.confusion_matrix(y_test, predict_target))

    print('DT訓(xùn)練集報(bào)告:')
    predict_train = tree1.predict(X_train)
    print(metrics.classification_report(y_train, predict_train))
    print(metrics.confusion_matrix(y_train, predict_train))

    # Read the importances once, keep only the features the tree actually
    # used, and order them from most to least important.
    importances = tree1.feature_importances_
    used = [
        (feature[idx], importances[idx])
        for idx in np.flatnonzero(importances > 0)
    ]
    used.sort(key=lambda pair: pair[1], reverse=True)
    data_x = [name for name, _ in used]
    data_hight = [weight for _, weight in used]

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.figure()
    positions = range(len(data_x))
    plt.barh(positions, data_hight, color='#6699CC')
    plt.yticks(positions, data_x, fontsize=12)

    plt.tick_params(labelsize=12)  # tick label font size
    plt.xlabel('Feature importance', fontsize=14)
    plt.title('DT feature importance analysis', fontsize=14)
    plt.show()
    return list(y_test), list(predict_target_prob_dt)
| 209 |  | 
| 210 |  | 
if __name__ == '__main__':
    # Pipeline: load -> clean -> encode -> PCA plot -> split -> decision
    # tree -> ROC curve.
    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences (e.g. "\h", "\H"); the resulting path is unchanged.
    data1 = Read_data(r'F:\數(shù)據(jù)雜壇\0504\heartdisease\Heart-Disease-Data-Set-main\UCI Heart Disease Dataset.csv')
    data1 = data_clean(data1)
    # Segment_statistics(data1)
    data2 = data_encoding(data1)
    PCA_analysis(data2)
    X_train, y_train, X_test, y_test, feature = data_partition(data2)

    y_test, predict_target_prob_dt = DT(X_train, y_train, X_test, y_test, feature)
    Draw_ROC(y_test, predict_target_prob_dt)