World's Best AI Learning Platform with profoundly Demanding Certification Programs
Designed by IITians, only for AI Learners.
Designed by IITians, only for AI Learners.
New to InsideAIML? Create an account
Employer? Create an account
Download our e-book of Introduction To Python
4.5 (1,292 Ratings)
559 Learners
Shashank Shanu
a year ago
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset from a CSV file given by a relative path.
data = pd.read_csv("People Charm case.csv")
# Bare expression: shows the frame when run as a notebook cell.
data
# (rows, columns) of the loaded frame.
data.shape
(14999, 10)
# Print the index range, per-column non-null counts, dtypes and memory usage.
data.info()
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 satisfactoryLevel 14999 non-null float64
1 lastEvaluation 14999 non-null float64
2 numberOfProjects 14999 non-null int64
3 avgMonthlyHours 14999 non-null int64
4 timeSpent.company 14999 non-null int64
5 workAccident 14999 non-null int64
6 left 14999 non-null int64
7 promotionInLast5years 14999 non-null int64
8 dept 14999 non-null object
9 salary 14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
# Count missing values per column. pandas exposes this check as
# DataFrame.isnull() (an alias of isna()); DataFrame has no `isnone` method.
data.isnull().sum()
satisfactoryLevel 0
lastEvaluation 0
numberOfProjects 0
avgMonthlyHours 0
timeSpent.company 0
workAccident 0
left 0
promotionInLast5years 0
dept 0
salary 0
dtype: int64
# Draw the boolean missing-value mask as a heatmap. The mask comes from
# DataFrame.isnull(); DataFrame has no `isnone` method.
sns.heatmap(data.isnull())
# List the distinct department labels.
data['dept'].unique()
array(['sales', 'accounting', 'hr', 'technical', 'support', 'IT',
'product_mng', 'marketing', 'management', 'RandD'], dtype=object)
# Number of distinct departments.
data['dept'].nunique()
10
# Row count per department, most frequent first.
data['dept'].value_counts()
sales 4140
technical 2720
support 2229
IT 1227
product_mng 902
marketing 858
RandD 787
accounting 767
hr 739
management 630
Name: dept, dtype: int64
# List the distinct salary bands.
data['salary'].unique()
array(['low', 'medium', 'high'], dtype=object)
# Row count per salary band, most frequent first.
data['salary'].value_counts()
low 7316
medium 6446
high 1237
Name: salary, dtype: int64
# Frequency of each distinct satisfaction score.
data['satisfactoryLevel'].value_counts()
0.10 358
0.11 335
0.74 257
0.77 252
0.84 247
...
0.25 34
0.28 31
0.27 30
0.12 30
0.26 30
Name: satisfactoryLevel, Length: 92, dtype: int64
# Frequency of each project count.
data['numberOfProjects'].value_counts()
4 4365
3 4055
5 2761
2 2388
6 1174
7 256
Name: numberOfProjects, dtype: int64
# Box plots of three numeric columns. The series is passed as `x=` explicitly:
# newer seaborn releases treat the first positional argument as `data`, which
# changes how a bare Series is drawn.
sns.boxplot(x=data['avgMonthlyHours'])
sns.boxplot(x=data['satisfactoryLevel'])
sns.boxplot(x=data['lastEvaluation'])
# Distribution plots of two numeric columns.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version and consider histplot/displot when upgrading.
sns.distplot(data["avgMonthlyHours"])
sns.distplot(data["lastEvaluation"])
# Column groups reused by the plotting cells.
numerical_features = [
    'satisfactoryLevel',
    'lastEvaluation',
    'numberOfProjects',
    'avgMonthlyHours',
    'timeSpent.company',
]
categorical_features = [
    'dept',
    'salary',
    'workAccident',
    'promotionInLast5years',
]

# One histogram per numeric column on a 2x4 grid; the value returned by
# DataFrame.hist is printed, as in the original cell.
hist_axes = data[numerical_features].hist(layout=(2, 4), figsize=(15, 6), bins=15)
print(hist_axes)
# Bar chart of row counts per category. The series is passed as `x=` because
# newer seaborn releases no longer accept it as the first positional argument
# with the same meaning.
sns.countplot(x=data['dept'])
sns.countplot(x=data['salary'])
import matplotlib.pyplot as plt
# One count plot per categorical feature, side by side on a 1x4 grid.
fig, ax = plt.subplots(1, 4, figsize=(20, 8))
for variable, subplot in zip(categorical_features, ax.flatten()):
    # `x=` keeps the call valid on seaborn releases where the first
    # positional argument is `data`.
    sns.countplot(x=data[variable], ax=subplot)
    # Rotate tick labels so long category names do not overlap.
    for label in subplot.get_xticklabels():
        label.set_rotation(90)
# Box plot of each measure grouped by salary band, one figure per measure.
for y_column in ("timeSpent.company", "avgMonthlyHours"):
    plt.figure(figsize=(15, 10))
    sns.boxplot(data=data, x="salary", y=y_column)
    plt.xticks(rotation=90)
# Preview the first rows before the categorical columns are encoded.
data.head()
from sklearn.preprocessing import LabelEncoder
# Encoder instance used to turn string categories into integer codes.
x1= LabelEncoder()
# Replace the salary strings with integer codes, in place.
# NOTE(review): LabelEncoder presumably orders classes alphabetically, so the
# codes would not follow low < medium < high — confirm if ordering matters.
data['salary'] = x1.fit_transform(data['salary'])
data.head()
# Sanity check: the encoded column should still have three distinct values.
data['salary'].nunique()
3
# The same encoder instance is refit on 'dept', which replaces its fitted
# classes. Capture the salary classes first so the code -> label mapping for
# both columns stays available after this cell.
salary_classes = list(x1.classes_)
data['dept'] = x1.fit_transform(data['dept'])
dept_classes = list(x1.classes_)
data.head(3)
# importing libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
# Features: every column except the target.
X = data.drop(columns=['left'])
X.head(3)
# Target: the 'left' column.
Y = data["left"]
Y.head()
0 1
1 1
2 1
3 1
4 1
Name: left, dtype: int64
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=3)
# Size of the held-out target vector.
y_test.shape
(3000,)
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest with a fixed seed.
rf = RandomForestClassifier(random_state=0, n_estimators=100)
rf.fit(x_train, y_train)

# Score the held-out split.
y_pred = rf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Confusion Matrix: ", conf_mat, sep='\n')
print("Accuracy Score: ", accuracy_pct)
Confusion Matrix:
[[2274 3]
[ 21 702]]
Accuracy Score: 99.2
from sklearn import metrics

# Predicted class probabilities for the held-out rows; column 1 is taken as
# the score for the positive class.
probs = rf.predict_proba(x_test)
prob_positive = probs[:, 1]

# ROC curve points and the area under that curve.
roc_points = metrics.roc_curve(y_test, prob_positive)
fpr, tpr, threshold = roc_points
roc_auc = metrics.auc(fpr, tpr)
print('Area under the curve:', roc_auc)
Area under the curve: 0.990623050518414
# Plot the ROC curve with the AUC in the legend.
plt.title('Receiver Operating Characteristic')
# The color is passed by keyword so it cannot be mistaken for a format string.
plt.plot(fpr, tpr, color='orange', label='AUC= %0.2f' % roc_auc)
plt.legend(loc="lower right")
# Dashed diagonal: the line where TPR equals FPR, drawn for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()