from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pickle
import datetime
import pandas as pd
import numpy as np
import timeit
from matplotlib import pyplot as plt
pd.set_option("display.max_columns", 500)
train_df = pd.read_csv("D:\\DataSets\\airbnb\\train_users_2.csv", header=0, index_col=None)
test_df = pd.read_csv("D:\\DataSets\\airbnb\\test_users.csv", header=0, index_col=None)
sessions = pd.read_csv("D:\\DataSets\\airbnb\\sessions.csv", header=0, index_col=False)
df_all = pd.concat((train_df, test_df), axis=0, ignore_index=True)
By closely inspecting the data in the table, we can spot abnormalities, outliers, and redundant information.
train_df.head()
1) The date format is not consistent across the 'date_account_created', 'timestamp_first_active', and 'date_first_booking' columns. We can convert them all to a common timestamp format.
2) The gender column contains 'unknown' values for some rows, and the age column contains NaN values.
Let's check which columns have NaN values:
df_all.isna().sum()
"date_first_booking" column is not present in test_df. Therefore we drop it.
df_all.drop("date_first_booking", axis=1, inplace=True)
df_all["date_account_created"] = pd.to_datetime(df_all["date_account_created"], format='%Y-%m-%d')
df_all["timestamp_first_active"] = pd.to_datetime(df_all["timestamp_first_active"], format='%Y%m%d%H%M%S')
df_all["age"].hist()
Outliers are present (values of 1750 and above, which cannot be real ages).
df_all[df_all["age"] < 250]["age"].hist()
Outliers are still present (values between 90 and 120). Let's assume legitimate ages lie between 15 and 90; everything outside that range is replaced with a sentinel below.
df_all["age"].values
# Replace values outside [min_val, max_val] with NaN (the NaNs are filled with -1 below)
def remove_outliers(df, column, min_val, max_val):
    col_values = df[column].values
    df[column] = np.where(np.logical_or(col_values < min_val, col_values > max_val), np.nan, col_values)
    return df
df_all = remove_outliers(df_all, 'age', 15, 90)
df_all['age'] = df_all['age'].fillna(-1)
df_all["age"].hist()
df_all['date_account_created'] = df_all['date_account_created'].fillna(df_all['timestamp_first_active'])
df_all['first_affiliate_tracked'] = df_all['first_affiliate_tracked'].fillna(-1)
One-hot encoding
# Hand-rolled one-hot encoding: add a 0/1 indicator column for each category value
def convert_to_binary(df, column_to_convert):
    categories = df[column_to_convert].drop_duplicates()
    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert[:7] + '_' + cat_name[:10]
        df[col_name] = 0
        df.loc[df[column_to_convert] == category, col_name] = 1
    return df
columns_to_convert = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for col in columns_to_convert:
    df_all = convert_to_binary(df_all, col)
    df_all.drop(col, axis=1, inplace=True)
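As an aside, pandas ships an equivalent built-in. A minimal sketch using pd.get_dummies, where df_raw is a hypothetical copy of df_all taken before the loop above (the generated column names will differ from the hand-rolled version, so this is for comparison only):
# df_raw is a hypothetical pre-encoding copy of df_all (not created above)
dummies = pd.get_dummies(df_raw[columns_to_convert].astype(str), prefix=columns_to_convert)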
# Add new date related fields
df_all['day_account_created'] = df_all['date_account_created'].dt.weekday
df_all['month_account_created'] = df_all['date_account_created'].dt.month
df_all['quarter_account_created'] = df_all['date_account_created'].dt.quarter
df_all['year_account_created'] = df_all['date_account_created'].dt.year
df_all['hour_first_active'] = df_all['timestamp_first_active'].dt.hour
df_all['day_first_active'] = df_all['timestamp_first_active'].dt.weekday
df_all['month_first_active'] = df_all['timestamp_first_active'].dt.month
df_all['quarter_first_active'] = df_all['timestamp_first_active'].dt.quarter
df_all['year_first_active'] = df_all['timestamp_first_active'].dt.year
df_all['created_less_active'] = (df_all['date_account_created'] - df_all['timestamp_first_active']).dt.days
# Drop unnecessary columns
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
for column in columns_to_drop:
    if column in df_all.columns:
        df_all.drop(column, axis=1, inplace=True)
# Determine primary device
sessions_device = sessions.loc[:, ['user_id', 'device_type', 'secs_elapsed']]
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False)['secs_elapsed'].sum()
idx = aggregated_lvl1.groupby(['user_id'], sort=False)['secs_elapsed'].transform('max') == aggregated_lvl1['secs_elapsed']
df_primary = pd.DataFrame(aggregated_lvl1.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
df_primary.rename(columns = {'device_type':'primary_device', 'secs_elapsed':'primary_secs'}, inplace=True)
df_primary = convert_to_binary(df=df_primary, column_to_convert='primary_device')
df_primary.drop('primary_device', axis=1, inplace=True)
# Determine Secondary device
remaining = aggregated_lvl1.drop(aggregated_lvl1.index[idx])
idx = remaining.groupby(['user_id'], sort=False)['secs_elapsed'].transform('max') == remaining['secs_elapsed']
df_secondary = pd.DataFrame(remaining.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
df_secondary.rename(columns = {'device_type':'secondary_device', 'secs_elapsed':'secondary_secs'}, inplace=True)
df_secondary = convert_to_binary(df=df_secondary, column_to_convert='secondary_device')
df_secondary.drop('secondary_device', axis=1, inplace=True)
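The same primary/secondary split can also be derived by sorting once and taking the top two rows per user. A compact alternative sketch (primary_alt and secondary_alt are illustrative names; tie-breaking on secs_elapsed and the index behaviour of nth vary across pandas versions):
# Sort devices per user by total seconds, then pick rank 0 (primary) and rank 1 (secondary)
ranked = aggregated_lvl1.sort_values('secs_elapsed', ascending=False)
primary_alt = ranked.groupby('user_id', sort=False).nth(0)
secondary_alt = ranked.groupby('user_id', sort=False).nth(1)  # empty for single-device users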
# Count occurrences of each value of a column, per user (one row per id, one column per value)
def convert_to_counts(df, id_col, column_to_convert):
    df_counts = df.loc[:, [id_col, column_to_convert]]
    df_counts['count'] = 1
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()
    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)
    # Rename columns to '<source_column>_<category>'
    categories = list(df[column_to_convert].drop_duplicates())
    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert + '_' + cat_name
        new_df.rename(columns={category: col_name}, inplace=True)
    return new_df
# Aggregate and combine actions taken columns
session_actions = sessions.loc[:,['user_id', 'action', 'action_type', 'action_detail']]
columns_to_convert = ['action', 'action_type', 'action_detail']
session_actions = session_actions.fillna('not provided')
first = True
for column in columns_to_convert:
    print("Converting " + column + " column...")
    current_data = convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)
    # On the first pass the current data becomes the accumulated data; afterwards, merge on user_id
    if first:
        first = False
        actions_data = current_data
    else:
        actions_data = pd.concat([actions_data, current_data], axis=1, join='inner')
# Merge device datasets
print("Combining results...")
df_primary.set_index('user_id', inplace=True)
df_secondary.set_index('user_id', inplace=True)
device_data = pd.concat([df_primary, df_secondary], axis=1, join="outer")
# Merge device and actions datasets
combined_results = pd.concat([device_data, actions_data], axis=1, join='outer')
df_sessions = combined_results.fillna(0)
# Merge user and session datasets
df_all.set_index('id', inplace=True)
df_all = pd.concat([df_all, df_sessions], axis=1, join='inner')
# Prepare training data for modelling
train_df.set_index('id', inplace=True)
train_df = pd.concat([train_df['country_destination'], df_all], axis=1, join='inner')
id_train = train_df.index.values
labels = train_df['country_destination']
le = LabelEncoder()
y = le.fit_transform(labels)
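The columns of predict_proba later follow this encoding, so it is worth printing the integer-to-country mapping the encoder produced:
print(dict(zip(range(len(le.classes_)), le.classes_)))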
X = train_df.drop('country_destination', axis=1, inplace=False)
# Grid Search - Used to find best combination of parameters
# XGB_model = xgb.XGBClassifier(objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
# param_grid = {'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.3], 'n_estimators': [25, 50]}
# model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy', verbose=10, n_jobs=1, refit=True, cv=3)
# model.fit(X, y)
# print("Best score: %0.3f" % model.best_score_)
# print("Best parameters set:")
# best_parameters = model.best_estimator_.get_params()
# for param_name in sorted(param_grid.keys()):
# print("\t%s: %r" % (param_name, best_parameters[param_name]))
# After performing the grid search, we have found the best estimator
XGB_model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=50, max_depth=5, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, random_state=0)
XGB_model.fit(X, y)
# preserve classifier
with open("XGB_model_airbnb.pickle", "wb") as f:
    pickle.dump(XGB_model, f)
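The pickled classifier can later be restored without retraining:
with open("XGB_model_airbnb.pickle", "rb") as f:
    XGB_model = pickle.load(f)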
# Prepare test data for prediction
test_df.set_index('id', inplace=True)
test_df = pd.merge(test_df.loc[:,['date_first_booking']], df_all, how='left', left_index=True, right_index=True, sort=False)
X_test = test_df.drop('date_first_booking', axis=1, inplace=False)
X_test = X_test.fillna(-1)
id_test = test_df.index.values
# Make predictions
y_pred = XGB_model.predict_proba(X_test)
y_pred[:5]
# Take the 5 classes with the highest probabilities for each user
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()
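The ranked ids and countries can then be written out for scoring; a minimal sketch assuming the usual Kaggle submission layout with 'id' and 'country' columns:
submission = pd.DataFrame({'id': ids, 'country': cts})
submission.to_csv('submission.csv', index=False)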