Download our e-book of Introduction To Python
Shashank Shanu
2 years ago
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset from a CSV file addressed relative to the working directory.
data = pd.read_csv("People Charm case.csv")
# Display the frame, then its (rows, columns) shape.
data
data.shape
(14999, 10)
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
data.info()
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 satisfactoryLevel 14999 non-null float64
1 lastEvaluation 14999 non-null float64
2 numberOfProjects 14999 non-null int64
3 avgMonthlyHours 14999 non-null int64
4 timeSpent.company 14999 non-null int64
5 workAccident 14999 non-null int64
6 left 14999 non-null int64
7 promotionInLast5years 14999 non-null int64
8 dept 14999 non-null object
9 salary 14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
# Count missing values per column. pandas spells this isnull() (alias isna());
# DataFrame has no isnone() method, so the original call raised AttributeError.
data.isnull().sum()
satisfactoryLevel 0
lastEvaluation 0
numberOfProjects 0
avgMonthlyHours 0
timeSpent.company 0
workAccident 0
left 0
promotionInLast5years 0
dept 0
salary 0
dtype: int64
# Draw the boolean missing-value mask as a heatmap. isnull() is the pandas
# method; DataFrame has no isnone().
sns.heatmap(data.isnull())
# List the distinct department labels.
data['dept'].unique()
array(['sales', 'accounting', 'hr', 'technical', 'support', 'IT',
'product_mng', 'marketing', 'management', 'RandD'], dtype=object)
# Count the distinct department labels.
data['dept'].nunique()
10
# Number of rows per department.
data['dept'].value_counts()
sales 4140
technical 2720
support 2229
IT 1227
product_mng 902
marketing 858
RandD 787
accounting 767
hr 739
management 630
Name: dept, dtype: int64
# List the distinct salary labels.
data['salary'].unique()
array(['low', 'medium', 'high'], dtype=object)
# Number of rows per salary label.
data['salary'].value_counts()
low 7316
medium 6446
high 1237
Name: salary, dtype: int64
# Number of rows per distinct satisfactoryLevel value.
data['satisfactoryLevel'].value_counts()
0.10 358
0.11 335
0.74 257
0.77 252
0.84 247
...
0.25 34
0.28 31
0.27 30
0.12 30
0.26 30
Name: satisfactoryLevel, Length: 92, dtype: int64
# Number of rows per numberOfProjects value.
data['numberOfProjects'].value_counts()
4 4365
3 4055
5 2761
2 2388
6 1174
7 256
Name: numberOfProjects, dtype: int64
# Box plots of three numeric columns. The column is passed as x= explicitly:
# in current seaborn the only positional argument is `data`, so a bare
# positional Series is no longer interpreted as the x variable.
sns.boxplot(x=data['avgMonthlyHours'])
sns.boxplot(x=data['satisfactoryLevel'])
sns.boxplot(x=data['lastEvaluation'])
# distplot is deprecated; histplot with a KDE overlay on a density scale is
# the replacement that keeps the same kind of figure.
sns.histplot(x=data["avgMonthlyHours"], kde=True, stat="density")
sns.histplot(x=data["lastEvaluation"], kde=True, stat="density")
# Column groups reused by the plots below.
numerical_features = [
    'satisfactoryLevel',
    'lastEvaluation',
    'numberOfProjects',
    'avgMonthlyHours',
    'timeSpent.company',
]
categorical_features = [
    'dept',
    'salary',
    'workAccident',
    'promotionInLast5years',
]
# Histogram grid of the numeric columns; the returned axes array is printed.
hist_axes = data[numerical_features].hist(bins=15, figsize=(15, 6), layout=(2, 4))
print(hist_axes)
# Row counts per department and per salary label. The column is passed as x=
# because current seaborn accepts only `data` positionally.
sns.countplot(x=data['dept'])
sns.countplot(x=data['salary'])
import matplotlib.pyplot as plt
# One count plot per categorical feature, side by side in a 1x4 grid.
fig, ax = plt.subplots(1, 4, figsize=(20, 8))
for variable, subplot in zip(categorical_features, ax.flatten()):
    # Pass the column as x= explicitly; current seaborn accepts only `data`
    # positionally.
    sns.countplot(x=data[variable], ax=subplot)
    # Turn the x tick labels vertical.
    for label in subplot.get_xticklabels():
        label.set_rotation(90)
# Box plots of tenure and monthly hours, grouped by salary label, each on its
# own 15x10 figure with vertical x tick labels.
for target_column in ("timeSpent.company", "avgMonthlyHours"):
    plt.figure(figsize=(15, 10))
    sns.boxplot(x="salary", y=target_column, data=data)
    plt.xticks(rotation=90)
# Peek at the first rows of the frame.
data.head()
from sklearn.preprocessing import LabelEncoder

# Replace the salary labels with integer codes: learn the label set first,
# then map the column through it.
x1 = LabelEncoder()
x1.fit(data['salary'])
data['salary'] = x1.transform(data['salary'])
# Inspect the encoded frame and the number of distinct salary codes.
data.head()
data['salary'].nunique()
3
# Encode dept with its own encoder. Refitting x1 here would overwrite the
# salary classes it learned, making the salary codes impossible to map back
# to their labels.
dept_encoder = LabelEncoder()
data['dept'] = dept_encoder.fit_transform(data['dept'])
data.head(3)
# importing libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
# Independent variables: every column except the target.
X = data.drop(columns=['left'])
X.head(3)
# Dependent variable: the 'left' column.
Y = data.loc[:, "left"]
Y.head()
0 1
1 1
2 1
3 1
4 1
Name: left, dtype: int64
# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=3)
x_train, x_test, y_train, y_test = split_parts
y_test.shape
(3000,)
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest with a fixed seed (fit returns the estimator).
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
# Score the held-out rows and report the confusion matrix and accuracy in percent.
y_pred = rf.predict(x_test)
test_confusion = confusion_matrix(y_test, y_pred)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Confusion Matrix: ", test_confusion, sep='\n')
print("Accuracy Score: ", accuracy_pct)
Confusion Matrix:
[[2274 3]
[ 21 702]]
Accuracy Score: 99.2
from sklearn import metrics

# Take the second predict_proba column as the score for the ROC curve.
probs = rf.predict_proba(x_test)
prob_positive = probs[:, 1]
# Compute the ROC curve points and the area under them.
roc_points = metrics.roc_curve(y_test, prob_positive)
fpr, tpr, threshold = roc_points
roc_auc = metrics.auc(fpr, tpr)
print('Area under the curve:', roc_auc)
Area under the curve: 0.990623050518414
# Plot the ROC curve with its AUC in the legend, plus the dashed diagonal
# chance line for reference. Title spelling and axis-label capitalisation are
# corrected, and the line colour is passed as color= rather than as a format string.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, color='orange', label='AUC= %0.2f' % roc_auc)
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()