Even the most qualified and confident applicants worry about getting into grad school. Unfortunately, grad school admissions statistics tend to be more difficult to find than undergrad acceptance rates.
Here we're using two datasets. The first, GradCafe_data, was scraped from this website, a database of over 500,000 user-submitted admission results. The second is the 'Graduate Admissions' dataset submitted by user Mohan Acharya: a tidy set of 500 data points with features such as GRE, TOEFL, CGPA, and SOP scores.
A few things to note before we start:
In the GradCafe dataset, self-reported candidates are likelier to post when they are accepted, and they often post results for multiple applications. The data is by no means perfect, but we can still get some interesting insights.
In the Graduate Admissions dataset, the data is very likely representative of Indian applicants, as hinted in the overview section here.
import pandas as pd
import numpy as np
import pickle
import math
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option('display.max_columns', 30)
style.reload_library()
# importing gradcafe data
df = pd.read_csv("D:\\DataSets\\git_grad_admiss\\clean_data.csv", index_col=0)
We clean the data once and save it to CSV, so we don't repeat the process every time we run the file.
# One time cleaning data set
# df.drop('row_id', axis=1, inplace=True)
# df.drop(['decision_timestamp', 'post_timestamp'], axis=1, inplace=True)
# df['decision_date'] = pd.to_datetime(df['decision_date'].str.replace(r'[()]', ''), errors='coerce')
# df['post_date'] = pd.to_datetime(df['post_date'].str.replace(r'[()]', ''), errors='coerce')
# df.to_csv("D:\\DataSets\\git_grad_admiss\\clean_data.csv")  # one-time save after the cleaning above
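If we wanted to guard the one-time save explicitly, a minimal sketch with a file-existence check (the os check is an addition, not part of the original pipeline):
import os
clean_path = "D:\\DataSets\\git_grad_admiss\\clean_data.csv"
if not os.path.exists(clean_path):
    df.to_csv(clean_path)  # write the cleaned frame only on the first run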
df.info()
We can use the comments data to create a word cloud to see what is being discussed most frequently.
def create_wordcloud():
    # Create the wordcloud object from all comments (NaNs treated as empty strings)
    wordcloud = WordCloud(width=1600, height=800).generate(' '.join(df['comments'].fillna('')))
    # Display the generated image
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.tight_layout(pad=0)
    plt.show()
create_wordcloud()
# Top 10 Majors
print(df['major'].value_counts().head(10))
perc = (len(df[df['major']=='Computer Science']) + len(df[df['major']=='Economics']))/len(df)
print(f"\nComputer Science and Economics majors combined make up for {round(perc*100, 2)} percent of the crowd.")
# Top Degrees
print(df['degree'].value_counts())
rest = len(df[(df['degree'] != 'PhD') & (df['degree'] != 'MS')])
phd = len(df[(df['degree'] == 'PhD')])/rest
ms = len(df[(df['degree'] == 'MS')])/rest
print(f"\n\nratio of PhD : MS : rest\n\t {round(phd)} : {round(ms)} : 1")
# Statuses
print(df['status'].value_counts())
print("\nMost of the data we have is from Americans applying to American Universities, "
"with the number of international students being only half of that.")
# Count of Decisions
df['decision'].value_counts()
We'll group all decisions that are neither accepted nor rejected under 'In process'.
df.loc[(df['decision'] != 'Accepted') & (df['decision'] != 'Rejected'), 'decision'] = 'In process'
df['decision'].value_counts()
One of the columns is a binary feature called 'is_new_gre' that differentiates candidates scored on the old and new GRE scales. We'll use just the new scores to avoid complexity.
# Distribution of GRE Scores
df["gre_subject"][df["is_new_gre"] == True].plot(kind = 'hist',bins = 200,figsize = (8,8), color='k')
plt.title("GRE Scores")
plt.xlabel("GRE Score")
plt.ylabel("Frequency")
plt.show()
print("Many scores are close to 1000. This is possibly because we've used the feature 'gre_subject'\
which may be a subject specific test with a different score metric.")
We'll make a gre_total feature instead, by combining all the individual scores.
# Creating a true GRE total feature
df['gre_total'] = df['gre_verbal'] + df['gre_quant'] + df['gre_writing']
# Distribution of GRE Scores
df["gre_total"][df["is_new_gre"] == True].plot(kind='hist', bins=300, figsize=(8, 8), color='k')
plt.title("GRE Scores")
plt.xlabel("GRE Score")
plt.ylabel("Frequency")
plt.show()
print("The new GRE has a max score of 340, but here we can see that a few scores exceed 340. "
"If we choose to drop these values we\nwould be losing around "
f"{round((len(df[df['gre_total'] > 340])/len(df['gre_total']))*100, 2)} percent of GRE data. "
"We'll choose to discard the data.")
# dropping incorrect gre scores
df = df[df['gre_total'] <= 340]
# GRE score vs GPA scatter plot
plt.figure(figsize=(8, 8))
plt.scatter(df["gre_total"][df["is_new_gre"] == True], df["ugrad_gpa"][df["is_new_gre"] == True], s=7, color='k')
plt.title("GPA for GRE Scores")
plt.xlabel("GRE Score")
plt.ylabel("GPA")
plt.show()
print("While most people have entered gpa out of 4, a few people didn't get the memo and entered a CGPA out of 10")
# GRE score vs GPA scatter plot
print("Here's what our plot looks like by only including applicants between 2 and 4 gpa.")
plt.figure(figsize=(8, 8))
plt.scatter(df["gre_total"][df['ugrad_gpa'] <= 4.0][df['ugrad_gpa'] >= 2.0][df["is_new_gre"] == True],
df['ugrad_gpa'][df['ugrad_gpa'] <= 4.0][df['ugrad_gpa'] >= 2.0][df["is_new_gre"] == True],
s=1, color='k')
plt.title("CGPA for GRE Scores")
plt.xlabel("GRE Score")
plt.ylabel("CGPA")
plt.show()
P.S.: It's entirely possible we've included applicants with a low CGPA on the 10-point scale, but there's not much we can do about that.
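One crude heuristic, sketched here but left commented out like the cleaning above, would be to rescale anything that looks like a 10-point CGPA onto the 4-point scale; the cutoff of 4 is an assumption and would mislabel genuinely low 10-point CGPAs.
# ten_point = (df['ugrad_gpa'] > 4.0) & (df['ugrad_gpa'] <= 10.0)
# df.loc[ten_point, 'ugrad_gpa'] = df.loc[ten_point, 'ugrad_gpa'] * 0.4  # 10-point -> 4-point scale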
def verbal_quant_jointplot():
    mask = (df['ugrad_gpa'] <= 4.0) & (df['ugrad_gpa'] >= 2.0) & (df['is_new_gre'] == True)
    grev = df.loc[mask, 'gre_verbal']
    grem = df.loc[mask, 'gre_quant']
    # newer seaborn renamed size -> height and dropped stat_func
    ax = sns.jointplot(x=grev, y=grem, kind='kde', height=6, xlim=(130, 170), ylim=(130, 170))
    ax.set_axis_labels('GRE Verbal', 'GRE Quantitative')
    plt.show()
verbal_quant_jointplot()
Here we can see GRE quantitative scores going all the way up to the maximum possible (170), with most applicants' quant scores in the 160-170 range. Verbal scores seem relatively high as well.
def verbal_gpa_jointplot():
    mask = (df['ugrad_gpa'] <= 4.0) & (df['ugrad_gpa'] >= 2.0) & (df['is_new_gre'] == True)
    grev = df.loc[mask, 'gre_verbal']
    gpa = df.loc[mask, 'ugrad_gpa']
    ax = sns.jointplot(x=grev, y=gpa, kind='kde', height=6, xlim=(130, 170), ylim=(2, 4.3))
    ax.set_axis_labels('GRE Verbal', 'Undergraduate GPA')
    plt.show()
verbal_gpa_jointplot()
It looks like almost all candidates have a GPA between 3.5 and 4. Graduate schools, particularly PhD programs, tend to have a minimum GPA cutoff for applicants, usually around 3.5.
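We can check that share directly (a quick added verification, not in the original flow):
on_4_scale = df[(df['ugrad_gpa'] >= 2.0) & (df['ugrad_gpa'] <= 4.0)]
share = on_4_scale['ugrad_gpa'].between(3.5, 4.0).mean()
print(f"{round(share*100, 1)} percent of 4-point-scale entries report a GPA between 3.5 and 4")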
def top_uni_applied(count=5, x='gre_total', major=None):
    '''
    Plots a feature against GPA, grouped by university, aggregated by the mean score of all
    applicants, and sorted by rank. Returns a list of all universities ordered by rank; this
    list is reused by the top_uni_accepted() function.
    '''
    scaler = MinMaxScaler((0.1, 1))
    mask = (df['ugrad_gpa'] <= 4.0) & (df['ugrad_gpa'] >= 2.0) & (df['is_new_gre'] == True)
    uni_list = df[mask]
    if major:
        uni_list = uni_list[uni_list['major'] == major]
    uni_list = uni_list.groupby(['uni_name']).agg({'decision': 'count', 'gre_total': 'mean', 'ugrad_gpa': 'mean',
                                                   'gre_verbal': 'mean', 'gre_quant': 'mean'})
    # keep universities with enough entries: above the mean count when a major is given, else above 150
    uni_list = uni_list[uni_list['decision'] > (uni_list['decision'].mean() if major else 150)].sort_values(['ugrad_gpa', x], ascending=False)
    uni_list.rename(columns={'decision': 'entries'}, inplace=True)
    uni_list.reset_index(inplace=True)
    ranked_uni_list = uni_list.uni_name.tolist()
    uni_list = uni_list.head(count)
    s = scaler.fit_transform(uni_list.entries.values.reshape(-1, 1))
    with open("ranked_uni_list.pickle", "wb") as f:
        pickle.dump(ranked_uni_list, f)
    # Create figure
    with plt.style.context('ggplot'):
        plt.figure(figsize=(20, 18))
        # hue shows the complementary score to whatever is on the x-axis
        if x == 'gre_quant':
            score_hue = uni_list.gre_verbal
        elif x == 'gre_verbal':
            score_hue = uni_list.gre_quant
        else:
            score_hue = uni_list.gre_total
        # Create scatterplot. alpha controls the opacity and s controls the bubble size.
        ax = sns.scatterplot(x=uni_list[x], y=uni_list.ugrad_gpa, alpha=0.4, s=s*7000, hue=score_hue, palette='winter')
        xstart = math.floor(uni_list[x].min() - 1)
        xend = math.ceil(uni_list[x].max() + 1)
        ystart = uni_list.ugrad_gpa.min() - 0.02
        yend = uni_list.ugrad_gpa.max() + 0.02
        ax.set_xlim(xstart, xend)
        ax.set_ylim(ystart, yend)
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                     ax.get_xticklabels() + ax.get_yticklabels() + ax.get_legend().get_texts()):
            item.set_fontsize(16)
        # For each point, add a shortened university name inside the bubble
        for line in range(0, uni_list.shape[0]):
            ax.text(uni_list[x][line], uni_list.ugrad_gpa[line],
                    uni_list.uni_name[line].replace('University Of', '').replace('University', '').split(',')[-1],
                    horizontalalignment='center', color='black', fontsize=11)
    return ranked_uni_list
print("Bubble plot showing the average level of candidates applying, grouped by University. "
"The size of the bubbles indicates the number of applications to that perticular uni. "
"The Color indicates average total GRE score for those applicants.")
ranked_uni_list = top_uni_applied(count=17, x='gre_total', major=None)
_ = top_uni_applied(count=15, x='gre_verbal', major='Psychology')
Psychology applicants are expected to have a higher verbal score, with less weight placed on quant, although Stanford and UPenn applicants have pretty good quant scores.
def top_uni_accepted(count=5, x='gre_total', major=None, ranked_uni_list=ranked_uni_list):
    if not ranked_uni_list:
        with open("ranked_uni_list.pickle", "rb") as f:
            ranked_uni_list = pickle.load(f)
    scaler = MinMaxScaler((0.1, 1))
    mask = ((df['ugrad_gpa'] <= 4.3) & (df['ugrad_gpa'] >= 2.0) &
            (df['is_new_gre'] == True) & (df['decision'] != 'In process'))
    uni_list = df[mask]
    if major:
        uni_list = uni_list[uni_list['major'] == major]
    uni_list = uni_list.groupby(['uni_name', 'decision']).agg({'decision': 'count', 'gre_total': 'mean',
                                                               'ugrad_gpa': 'mean', 'gre_verbal': 'mean',
                                                               'gre_quant': 'mean'})
    uni_list = uni_list.sort_values(['ugrad_gpa', x], ascending=False)
    uni_list.rename(columns={'decision': 'entries'}, inplace=True)
    uni_list.reset_index(inplace=True)
    uni_list = uni_list[uni_list['uni_name'].isin(ranked_uni_list[:count])]
    uni_list = uni_list.reset_index().drop('index', axis=1)
    s = scaler.fit_transform(uni_list.entries.values.reshape(-1, 1))
    # Create figure
    with plt.style.context('ggplot'):
        plt.figure(figsize=(20, 18))
        # the empty string in hue_order just spaces the two decision colors apart in the palette
        ax = sns.scatterplot(x=uni_list[x], y=uni_list.ugrad_gpa, alpha=0.4, s=s*7000, hue=uni_list.decision,
                             hue_order=['Rejected', '', 'Accepted'], palette='Set1')
        xstart = math.floor(uni_list[x].min() - 1)
        xend = math.ceil(uni_list[x].max() + 1)
        ystart = uni_list.ugrad_gpa.min() - 0.02
        yend = uni_list.ugrad_gpa.max() + 0.02
        ax.set_xlim(xstart, xend)
        ax.set_ylim(ystart, yend)
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                     ax.get_xticklabels() + ax.get_yticklabels() + ax.get_legend().get_texts()):
            item.set_fontsize(16)
        # For each point, add a shortened university name inside the bubble
        for line in range(0, uni_list.shape[0]):
            ax.text(uni_list[x][line], uni_list.ugrad_gpa[line],
                    uni_list.uni_name[line].replace('University Of', '').replace('University', '').split(',')[-1],
                    horizontalalignment='center', color='black', fontsize=11)
print("When we split the applicants by accepted and reject, we can see how much difference there is between them.")
top_uni_accepted(count=10, x='gre_total')
print("Accepted/Rejected split for the Economics major")
top_uni_accepted(count=12, x='gre_quant', major='Economics')
def top_uni_status(count=5, x='gre_verbal', major=None):
    scaler = MinMaxScaler((0.1, 1))
    mask = ((df['ugrad_gpa'] <= 4.3) & (df['ugrad_gpa'] >= 2.0) & (df['is_new_gre'] == True) &
            (df['decision'] == 'Accepted') & (df['status'] != 'Other'))
    uni_list = df[mask]
    if major:
        uni_list = uni_list[uni_list['major'] == major]
    uni_list = uni_list.groupby(['uni_name', 'status']).agg({'status': 'count', 'gre_total': 'mean',
                                                             'ugrad_gpa': 'mean', 'gre_verbal': 'mean',
                                                             'gre_quant': 'mean'})
    uni_list = uni_list.sort_values(['ugrad_gpa', x], ascending=False)
    uni_list.rename(columns={'status': 'entries'}, inplace=True)
    uni_list.reset_index(inplace=True)
    uni_list = uni_list[uni_list['uni_name'].isin(ranked_uni_list[:count])]
    uni_list = uni_list.reset_index().drop('index', axis=1)
    s = scaler.fit_transform(uni_list.entries.values.reshape(-1, 1))
    # Create figure
    with plt.style.context('ggplot'):
        plt.figure(figsize=(20, 18))
        # Create scatterplot. alpha controls the opacity and s controls the bubble size.
        ax = sns.scatterplot(x=uni_list[x], y=uni_list.ugrad_gpa, alpha=0.4, s=s*7000, hue=uni_list.status,
                             hue_order=['International with US Degree', 'International', 'American'], palette='Dark2')
        xstart = math.floor(uni_list[x].min() - 1)
        xend = math.ceil(uni_list[x].max() + 1)
        ystart = uni_list.ugrad_gpa.min() - 0.02
        yend = uni_list.ugrad_gpa.max() + 0.02
        ax.set_xlim(xstart, xend)
        ax.xaxis.set_ticks(np.arange(xstart, xend, 1))
        ax.set_ylim(ystart, yend)
        ax.yaxis.set_ticks(np.arange(ystart, yend, 0.05))
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                     ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(16)
        # For each point, add a shortened (truncated) university name inside the bubble
        for line in range(0, uni_list.shape[0]):
            ax.text(uni_list[x][line], uni_list.ugrad_gpa[line],
                    uni_list.uni_name[line].replace('University Of', '').replace('University', '').split(',')[-1][:10],
                    horizontalalignment='center', color='black', fontsize=11)
print("Split by status of applicants accepted by their respective Unis. This visual is pretty interesting. "
"All three statuses seem to form their own group, "
"with there being some intersection between Americans and International applicants with US Degrees.")
top_uni_status(count=12, x='gre_quant')
It looks like international students are judged significantly more on their GRE quant scores than their GPA. American students, on the other hand, are given more weight for GPA than for quant scores.
top_uni_status(count=12, x='gre_verbal')
International students also seem to have been given a pass on their GRE verbal scores. Overall it looks like a mixed bag for international students with US degrees.
There's a lot more analysis that could be done here by varying count, major, status, and degree; a sample sweep is sketched below, but we'll stop there for now.
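For instance, a sweep like this reuses the plotting functions across a few majors (a sketch; the major names must match the dataset's spelling, as with 'Computer Science' and 'Economics' above):
for m in ['Computer Science', 'Economics', 'Psychology']:
    ranked = top_uni_applied(count=12, x='gre_total', major=m)
    top_uni_accepted(count=12, x='gre_total', major=m, ranked_uni_list=ranked)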
This is the second dataset we'll work on. It has very few data points, but it includes scores for all the tests one has to take to apply to universities abroad. We'll analyze the data, then perform both regression and classification to predict whether one will be accepted into a university based on their scores.
df2 = pd.read_csv("D:\\DataSets\\grad_admiss_kaggle\\Admission_Predict_Ver1.1.csv")
df2.head()
df2.info()
x = ['Have Research Exp', 'No Research Exp']
y = np.array([len(df2[df2.research == 1]), len(df2[df2.research == 0])])  # counts ordered to match the labels
plt.bar(x, y, width=0.3, color='k', edgecolor='w')
plt.title("Research Experience")
plt.xlabel("Candidates")
plt.ylabel("Frequency")
plt.show()
df2.corr()
plt.figure(figsize=(10, 8))
style.use('fast')
sns.heatmap(df2.corr(), vmin=0.3, vmax=1, cmap="YlGnBu", annot=True)
The strongest predictors of chance of admit are CGPA, GRE, and TOEFL (and university rating to some extent).
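We can read that ranking straight off the matrix (a small added check):
print(df2.corr()['chance_of_admit'].drop('chance_of_admit').sort_values(ascending=False))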
def plot_dist(x=['cgpa']):
    for col in x:
        with plt.style.context('ggplot'):
            plt.figure(figsize=(12, 8))
            # distplot is deprecated in newer seaborn (>=0.11); kept to match the version used here
            sns.distplot(df2[col], rug=True, norm_hist=False,
                         rug_kws={"color": "purple", "alpha": 0.3, "linewidth": 2.5, "height": 0.07},
                         kde_kws={"color": "g", "alpha": 0.3, "linewidth": 2, "shade": True})
            plt.show()
plot_dist()
print("Distribution plot for cgpa shows us most people have their cgpa somewhere around 8.5-9 mark")
r_map = {
1: 'y',
0: 'n'
}
df2['research_y_n'] = df2['research'].map(r_map)
df2['accepted'] = 0
df2.loc[df2['chance_of_admit'] >= 0.8, 'accepted'] = 1
print("We've made a new column 'accepted' where all applicants with a chance of admit 0.8 and above are considered as accepted and the rest rejected.")
df2.head()
sns.pairplot(df2[['gre', 'toefl', 'cgpa', 'chance_of_admit', 'research_y_n']], kind='scatter',
hue='research_y_n', palette="husl", diag_kind='kde',
diag_kws={'shade':True, 'linewidth': 1})
print("Scatter plots of most important features against each other, "
"with data points divided by whether a candidate has research experience.")
It's clear that applicants with research experience stand out with higher scores on all fronts, and are thus more likely to be accepted into uni. Some of the interesting plots we could focus on are:
def plot_key():
    # lmplot is figure-level and creates its own figure, so no plt.figure calls are needed
    with plt.style.context('ggplot'):
        sns.lmplot(x="cgpa", y="chance_of_admit", data=df2, hue='research_y_n', ci=10, height=8, aspect=1.4,
                   palette="husl",
                   scatter_kws=dict(alpha=0.5, edgecolors="black"),
                   line_kws={"alpha": 0.5, "lw": 3})
        plt.show()
        sns.lmplot(x="sop", y="uni_rating", data=df2, hue='research_y_n', ci=90, x_jitter=0.3, y_jitter=0.5,
                   height=8, aspect=1.4, palette="husl",
                   scatter_kws=dict(alpha=0.5, edgecolors="black"),
                   line_kws={"alpha": 0.5, "lw": 2})
        plt.show()
        sns.lmplot(x="toefl", y="sop", data=df2, hue='research_y_n', ci=10, x_jitter=0.3, y_jitter=0.5,
                   height=8, aspect=1.4, palette="husl",
                   scatter_kws=dict(alpha=0.5, edgecolors="black"),
                   line_kws={"alpha": 0.5, "lw": 3})
        plt.show()
plot_key()
From the correlation matrix, we found the most important predictors of 'chance_of_admit' to be CGPA, GRE score, and TOEFL score.
main_df = df2[['cgpa', 'gre', 'toefl', 'chance_of_admit']]
X = main_df.drop('chance_of_admit', axis=1)
y = main_df['chance_of_admit']
col = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
def scale_features(X_train, X_test, col):
    # fit the scaler on the training set only, then apply the same transform to both splits
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=col)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=col)
    return X_train, X_test
X_train, X_test = scale_features(X_train, X_test, col)
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)
pred = forest_reg.predict(X_test)
error = mse(y_test, pred)
print(f"mse: {round(error, 4)}")
forest_score = r2_score(y_test, pred)
print("r_square score: ", forest_score)
res = pd.DataFrame()
res['Prediction'] = np.round(pred, 2)
res['Actual'] = np.array(y_test)
res.head()
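GridSearchCV was imported above but never used; as a hedged sketch, we could tune the forest before comparing models (the grid values here are illustrative, not tuned for this data):
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}
grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, scoring='r2', cv=5)
grid.fit(X_train, y_train)
print("best params:", grid.best_params_, "cv r2:", round(grid.best_score_, 4))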
lin_reg = LinearRegression(n_jobs=-1, copy_X=True)
lin_reg.fit(X_train, y_train)
pred = lin_reg.predict(X_test)
error = mse(y_test, pred)
print(f"mse: {round(error, 4)}")
lin_score = r2_score(y_test, pred)
print("r_square score: ", lin_score)
res = pd.DataFrame()
res['Prediction'] = np.round(pred, 2)
res['Actual'] = np.array(y_test)
res.head()
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
pred = tree_reg.predict(X_test)
error = mse(y_test, pred)
print(f"mse: {round(error, 4)}")
tree_score = r2_score(y_test, pred)
print("r_square score: ", tree_score)
res = pd.DataFrame()
res['Prediction'] = np.round(pred, 2)
res['Actual'] = np.array(y_test)
res.head()
x = ['Random Forest', 'Linear Reg', 'Decision Tree']
y = np.array([forest_score, lin_score, tree_score])
plt.bar(x, y, color='k', width=0.6)
plt.title("Choosing best algorithm")
plt.xlabel("Model")
plt.ylabel("r-squared")
plt.show()
main_df = df2[['cgpa', 'gre', 'toefl', 'accepted']]
X = main_df.drop('accepted', axis=1)
y = main_df['accepted']
col = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = scale_features(X_train, X_test, col)
def get_stats(tn, fp, fn, tp, all_stats=False):
    total = (tn + fp + fn + tp)
    freq_majority_class = (tn + fp) if (tn + fp) > (fn + tp) else (fn + tp)
    recall = tp/(fn + tp)
    precision = tp/(fp + tp)
    f1 = (precision*recall*2)/(precision + recall)
    if all_stats:
        print(f'''
        True Negative: {tn}, True Positive: {tp}, False Negative: {fn}, False Positive: {fp}
        Accuracy: {(tp + tn)/total}
        Misclassification Rate: {1 - (tp + tn)/total}
        True Positive Rate (Sensitivity/Recall): {recall}
        False Positive Rate: {fp/(tn + fp)}
        True Negative Rate: {tn/(tn + fp)}
        Precision: {precision}
        Prevalence: {(fn + tp)/total}
        Null error rate: {1 - (freq_majority_class/total)}
        f1 score: {f1}
        ''')
    else:
        print(f'''
        True Negative: {tn}, True Positive: {tp}, False Negative: {fn}, False Positive: {fp}
        True Positive Rate (Sensitivity/Recall): {recall}
        Precision: {precision}
        Prevalence: {(fn + tp)/total}
        Null error rate: {1 - (freq_majority_class/total)}
        f1 score: {f1}
        ''')
    return recall, precision, f1
forest_clf = RandomForestClassifier(n_estimators=125, random_state=42)
forest_clf.fit(X_train, y_train)
pred = forest_clf.predict(X_test)
forest_score = accuracy_score(y_test, pred)
print(f"Accuracy: {forest_score}")
cm = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp = cm
_, _, forest_f1 = get_stats(tn, fp, fn, tp)
res = pd.DataFrame()
res['Prediction'] = pred
res['Actual'] = np.array(y_test)
res.loc[::20]
log_clf = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
log_clf.fit(X_train, y_train)
pred = log_clf.predict(X_test)
log_score = accuracy_score(y_test, pred)
print(f"Accuracy: {log_score}")
cm = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp = cm
_, _, log_f1 = get_stats(tn, fp, fn, tp)
res = pd.DataFrame()
res['Prediction'] = pred
res['Actual'] = np.array(y_test)
res.loc[::20]
sv_clf = svm.SVC(random_state=42)
sv_clf.fit(X_train, y_train)
pred = sv_clf.predict(X_test)
sv_score = accuracy_score(y_test, pred)
print(f"Accuracy: {sv_score}")
cm = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp = cm
_, _, sv_f1 = get_stats(tn, fp, fn, tp)
res = pd.DataFrame()
res['Prediction'] = pred
res['Actual'] = np.array(y_test)
res.loc[::20]
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
pred = tree_clf.predict(X_test)
tree_score = accuracy_score(y_test, pred)
print(f"Accuracy: {tree_score}")
cm = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp = cm
_, _, tree_f1 = get_stats(tn, fp, fn, tp)
res = pd.DataFrame()
res['Prediction'] = pred
res['Actual'] = np.array(y_test)
res.loc[::20]
nb = GaussianNB()
nb.fit(X_train, y_train)
pred = nb.predict(X_test)
nb_score = accuracy_score(y_test, pred)
print(f"Accuracy: {nb_score}")
cm = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp = cm
_, _, nb_f1 = get_stats(tn, fp, fn, tp)
res = pd.DataFrame()
res['Prediction'] = pred
res['Actual'] = np.array(y_test)
res.loc[::20]
x = ['Random Forest', 'Log. Reg', 'Sup. Vector', 'Dec. Tree', 'Naive Bayes']
y = np.array([forest_score, log_score, sv_score, tree_score, nb_score])
plt.bar(x, y, color='k', width=0.4, log=True)
for a, b in zip(x, y):
    if b == y.max():
        b = round(b, 3)
        plt.text(a, b, str(b))
plt.title("Classification Algorithm accuracy")
plt.xlabel("Model")
plt.ylabel("log(accuracy)")
plt.show()
x = ['Random Forest', 'Log. Reg', 'Sup. Vector', 'Dec. Tree', 'Naive Bayes']
y = np.array([forest_f1, log_f1, sv_f1, tree_f1, nb_f1])
plt.bar(x, y, color='k', width=0.4, log=True)
for a, b in zip(x, y):
    if b == y.max():
        b = round(b, 3)
        plt.text(a, b, str(b))
plt.title("Classification Algorithm f1 scores")
plt.xlabel("Model")
plt.ylabel("log(f1)")
plt.show()
X = main_df.drop('accepted', axis=1)
y = main_df['accepted']
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
pred = sv_clf.predict(X)
accuracy_score(y, pred)
final_df = pd.DataFrame(X, columns=col)
final_df['Prediction'] = pred
final_df['Actual'] = np.array(y)
final_df['is_correct'] = (final_df['Prediction'] == final_df['Actual'])
final_df.head()
In the plot below, the dots represent correct predictions and the 'X's incorrect ones. The hue shows the actual class, so an 'X' drawn in the 'accepted' color means the true label at that point was accepted and our model got it wrong.
with plt.style.context('seaborn-notebook'):
    plt.figure(figsize=(12, 8))
    ax = sns.scatterplot(data=final_df[final_df['is_correct'] == True], x='gre', y='cgpa',
                         hue='Actual', palette='coolwarm')
    # draw the misclassified points as X's, colored by their actual class
    colors = {1: 'darkred',
              0: 'darkblue'}
    incorrect = final_df[final_df['is_correct'] == False]
    ax.scatter(x=incorrect['gre'], y=incorrect['cgpa'], marker='x',
               c=incorrect['Actual'].apply(lambda x: colors[x]), label='Incorrect')
    plt.legend()