Feature importance refers to a class of techniques for assigning scores to input features to a predictive model that indicates the relative importance of each feature when making a prediction.
Feature importance scores can be calculated for problems that involve predicting a numerical value, called regression, and those problems that involve predicting a class label, called classification.
The scores are useful and can be used in a range of situations in a predictive modeling problem, such as:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv(r'F:\Feature Engineering\Univariate Selection\cardio_train.csv',sep = ";")
df
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 18393 | 2 | 168 | 62.0 | 110 | 80 | 1 | 1 | 0 | 0 | 1 | 0 |
1 | 1 | 20228 | 1 | 156 | 85.0 | 140 | 90 | 3 | 1 | 0 | 0 | 1 | 1 |
2 | 2 | 18857 | 1 | 165 | 64.0 | 130 | 70 | 3 | 1 | 0 | 0 | 0 | 1 |
3 | 3 | 17623 | 2 | 169 | 82.0 | 150 | 100 | 1 | 1 | 0 | 0 | 1 | 1 |
4 | 4 | 17474 | 1 | 156 | 56.0 | 100 | 60 | 1 | 1 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | 99993 | 19240 | 2 | 168 | 76.0 | 120 | 80 | 1 | 1 | 1 | 0 | 1 | 0 |
69996 | 99995 | 22601 | 1 | 158 | 126.0 | 140 | 90 | 2 | 2 | 0 | 0 | 1 | 1 |
69997 | 99996 | 19066 | 2 | 183 | 105.0 | 180 | 90 | 3 | 1 | 0 | 1 | 0 | 1 |
69998 | 99998 | 22431 | 1 | 163 | 72.0 | 135 | 80 | 1 | 2 | 0 | 0 | 0 | 1 |
69999 | 99999 | 20540 | 1 | 170 | 72.0 | 120 | 80 | 2 | 1 | 0 | 0 | 1 | 0 |
70000 rows × 13 columns
df.shape
(70000, 13)
df['yr'] = (df['age']/365).round(0)
df['yr']
0 50.0 1 55.0 2 52.0 3 48.0 4 48.0 ... 69995 53.0 69996 62.0 69997 52.0 69998 61.0 69999 56.0 Name: yr, Length: 70000, dtype: float64
df.describe()
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | yr | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 |
mean | 49972.419900 | 19468.865814 | 1.349571 | 164.359229 | 74.205690 | 128.817286 | 96.630414 | 1.366871 | 1.226457 | 0.088129 | 0.053771 | 0.803729 | 0.499700 | 53.338686 |
std | 28851.302323 | 2467.251667 | 0.476838 | 8.210126 | 14.395757 | 154.011419 | 188.472530 | 0.680250 | 0.572270 | 0.283484 | 0.225568 | 0.397179 | 0.500003 | 6.765294 |
min | 0.000000 | 10798.000000 | 1.000000 | 55.000000 | 10.000000 | -150.000000 | -70.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 30.000000 |
25% | 25006.750000 | 17664.000000 | 1.000000 | 159.000000 | 65.000000 | 120.000000 | 80.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 48.000000 |
50% | 50001.500000 | 19703.000000 | 1.000000 | 165.000000 | 72.000000 | 120.000000 | 80.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 54.000000 |
75% | 74889.250000 | 21327.000000 | 2.000000 | 170.000000 | 82.000000 | 140.000000 | 90.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 58.000000 |
max | 99999.000000 | 23713.000000 | 2.000000 | 250.000000 | 200.000000 | 16020.000000 | 11000.000000 | 3.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 65.000000 |
df.corr()
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | yr | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | 1.000000 | 0.003457 | 0.003502 | -0.003038 | -0.001830 | 0.003356 | -0.002529 | 0.006106 | 0.002467 | -0.003699 | 0.001210 | 0.003755 | 0.003799 | 0.003050 |
age | 0.003457 | 1.000000 | -0.022811 | -0.081515 | 0.053684 | 0.020764 | 0.017647 | 0.154424 | 0.098703 | -0.047633 | -0.029723 | -0.009927 | 0.238159 | 0.999090 |
gender | 0.003502 | -0.022811 | 1.000000 | 0.499033 | 0.155406 | 0.006005 | 0.015254 | -0.035821 | -0.020491 | 0.338135 | 0.170966 | 0.005866 | 0.008109 | -0.023017 |
height | -0.003038 | -0.081515 | 0.499033 | 1.000000 | 0.290968 | 0.005488 | 0.006150 | -0.050226 | -0.018595 | 0.187989 | 0.094419 | -0.006570 | -0.010821 | -0.081456 |
weight | -0.001830 | 0.053684 | 0.155406 | 0.290968 | 1.000000 | 0.030702 | 0.043710 | 0.141768 | 0.106857 | 0.067780 | 0.067113 | -0.016867 | 0.181660 | 0.053661 |
ap_hi | 0.003356 | 0.020764 | 0.006005 | 0.005488 | 0.030702 | 1.000000 | 0.016086 | 0.023778 | 0.011841 | -0.000922 | 0.001408 | -0.000033 | 0.054475 | 0.020793 |
ap_lo | -0.002529 | 0.017647 | 0.015254 | 0.006150 | 0.043710 | 0.016086 | 1.000000 | 0.024019 | 0.010806 | 0.005186 | 0.010601 | 0.004780 | 0.065719 | 0.017754 |
cholesterol | 0.006106 | 0.154424 | -0.035821 | -0.050226 | 0.141768 | 0.023778 | 0.024019 | 1.000000 | 0.451578 | 0.010354 | 0.035760 | 0.009911 | 0.221147 | 0.154386 |
gluc | 0.002467 | 0.098703 | -0.020491 | -0.018595 | 0.106857 | 0.011841 | 0.010806 | 0.451578 | 1.000000 | -0.004756 | 0.011246 | -0.006770 | 0.089307 | 0.098596 |
smoke | -0.003699 | -0.047633 | 0.338135 | 0.187989 | 0.067780 | -0.000922 | 0.005186 | 0.010354 | -0.004756 | 1.000000 | 0.340094 | 0.025858 | -0.015486 | -0.047884 |
alco | 0.001210 | -0.029723 | 0.170966 | 0.094419 | 0.067113 | 0.001408 | 0.010601 | 0.035760 | 0.011246 | 0.340094 | 1.000000 | 0.025476 | -0.007330 | -0.029918 |
active | 0.003755 | -0.009927 | 0.005866 | -0.006570 | -0.016867 | -0.000033 | 0.004780 | 0.009911 | -0.006770 | 0.025858 | 0.025476 | 1.000000 | -0.035653 | -0.009819 |
cardio | 0.003799 | 0.238159 | 0.008109 | -0.010821 | 0.181660 | 0.054475 | 0.065719 | 0.221147 | 0.089307 | -0.015486 | -0.007330 | -0.035653 | 1.000000 | 0.237749 |
yr | 0.003050 | 0.999090 | -0.023017 | -0.081456 | 0.053661 | 0.020793 | 0.017754 | 0.154386 | 0.098596 | -0.047884 | -0.029918 | -0.009819 | 0.237749 | 1.000000 |
df = df.drop(['yr'],axis = 1)
x = df.iloc[:,:-1]
y = df.iloc[:,12]
y
0 0 1 1 2 1 3 1 4 0 .. 69995 0 69996 1 69997 1 69998 1 69999 0 Name: cardio, Length: 70000, dtype: int64
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(x,y)
ExtraTreesClassifier()
model.feature_importances_
array([0.15970282, 0.20506294, 0.00625649, 0.1362955 , 0.14810443, 0.16937946, 0.10423893, 0.04591991, 0.01149954, 0.00371542, 0.00443684, 0.00538771])
top = pd.Series(model.feature_importances_,index = x.columns)
top
id 0.159703 age 0.205063 gender 0.006256 height 0.136296 weight 0.148104 ap_hi 0.169379 ap_lo 0.104239 cholesterol 0.045920 gluc 0.011500 smoke 0.003715 alco 0.004437 active 0.005388 dtype: float64
top.nlargest(10).plot(kind = 'barh')
<matplotlib.axes._subplots.AxesSubplot at 0x2285e704848>