Logistic regression using the Titanic dataset on Kaggle.¶

Lecture notes on logistic regression: http://george1328.github.io/lecture_notes/Logistic_Regression.pdf

import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib

df = pd.read_csv('train.csv', header = 0)

df.describe()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)

There are missing values in Age, Cabin and Embarked

df.Embarked.describe()

count     889
unique      3
top         S
freq      644
dtype: object

# seaborn is cool. Here we can the equivalent of a Pandas group by AND plot in one step
sns.boxplot(df.Age, df.Sex)

<matplotlib.axes._subplots.AxesSubplot at 0x11114a810>

# loof at the distribution of ages
sns.kdeplot(df.Age, shade=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1111be390>

sns.boxplot(df.Age, df.Survived)

<matplotlib.axes._subplots.AxesSubplot at 0x1113bcd50>

# Lets populate 3 missing Embarked values with the mode value
from scipy.stats import mode
mode=mode(df.Embarked)[0][0]
print mode
df.Embarked = df.Embarked.fillna(mode)

S

#Lets use a temp df to drop missing values. First drop all Cabin NAs, then drop the rest
temp = df.drop(['Cabin'], axis =1)
temp = temp.dropna()

# Here's a way to view data using the Pandas groupby function
grouped = temp.groupby('Pclass')
grouped.get_group(1).Age.hist()
grouped.get_group(2).Age.hist()
grouped.get_group(3).Age.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x111518d50>

#Here we use matplotlip. Notice that this graph tells us that age distribution is different for each class
plt.figure(figsize(5,3))
temp.Age[temp.Pclass == 1].plot(kind='kde')    
temp.Age[temp.Pclass == 2].plot(kind='kde')
temp.Age[temp.Pclass == 3].plot(kind='kde')
plt.xlabel("Age")    
plt.title("Age by class")
plt.legend(('1st', '2nd','3rd'))

<matplotlib.legend.Legend at 0x111a31910>

#Lets get the mean and median ages
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
print meanAges
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
print medianAges

Pclass
1         38.233441
2         29.877630
3         25.140620
Name: Age, dtype: float64
Pclass
1         37
2         29
3         24
Name: Age, dtype: float64

#Lets plugin median ages for the missing values
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(meanAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)

(df.Fare==0).sum()

15

medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')

df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)

#Now our data is fixed and ready
df.describe()

#some data re-engineering
df['familySize'] = df.SibSp + df.Parch +1

df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)

port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])

#Lets view the data
fig = plt.figure(figsize=(18,9), dpi=1600)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace =.5, wspace=.15)
alpha=alpha_scatterplot = 0.2 
alpha_bar_chart = 0.55

ax1 = plt.subplot2grid((3,3),(0,0))
df.Survived.value_counts().plot(kind='bar', alpha=alpha_bar_chart)
ax1.set_xlim(-1, 2)
ax1.set_xticklabels(["Died", "Survived"], rotation=0)
plt.title("Distribution of Survival")    

plt.subplot2grid((3,3),(0,2))
plt.scatter(df.Survived, df.Age, alpha=alpha_scatterplot)
plt.ylabel("Age")
plt.title("Survial by Age")

ax3 = plt.subplot2grid((3,3),(0,1))
df.Pclass.value_counts().plot(kind="bar", alpha=alpha_bar_chart)
ax3.set_xticklabels(["3rd Class", "1st Class", "2nd Class"], rotation=0)
plt.title("Class Distribution")

plt.subplot2grid((3,3),(1,0))
plt.scatter(df.Survived, df.Gender, alpha=alpha_bar_chart)
plt.ylabel("Age")
plt.title("Survial by Gender")

ax4 = plt.subplot2grid((3,3),(1,0))
female1class = df.Survived[df.Sex == 'female'][df.Pclass == 1].value_counts()
female1class.plot(kind='bar',  color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female1class))
plt.title("female 1st class")

ax4 = plt.subplot2grid((3,3),(1,1))
female2class = df.Survived[df.Sex == 'female'][df.Pclass == 2].value_counts()
female2class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female2class))
plt.title("female 2nd class")


ax4 = plt.subplot2grid((3,3),(1,2))
female3class = df.Survived[df.Sex == 'female'][df.Pclass == 3].value_counts()
female3class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(female3class))
plt.title("female 3rd class")


ax4 = plt.subplot2grid((3,3),(2,0))
male1 = df.Survived[df.Sex == 'male'][df.Pclass == 1].value_counts()
male1.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male1))
plt.title("male 1st class")


ax4 = plt.subplot2grid((3,3),(2,1))
male2 = df.Survived[df.Sex == 'male'][df.Pclass == 2].value_counts()
male2.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male2))
plt.title("male 2nd class")


ax4 = plt.subplot2grid((3,3),(2,2))
male3 = df.Survived[df.Sex == 'male'][df.Pclass == 3].value_counts()
male3.plot(kind='bar', color = '#2aa198', alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male3))
plt.title("male 3rd class")

<matplotlib.text.Text at 0x110dad210>

features = df[['Gender', 'Pclass','Fare', 'familySize', 'Port' ]].values
target = df['Survived'].values

feature_names = ['Gender', 'Pclass','Fare', 'familySize', 'Port']
model_lr = LogisticRegression(C=1).fit(features, target)
x = np.arange(len(feature_names))

plt.bar(x, model_lr.coef_.ravel())
_ = plt.xticks(x + 0.5, feature_names, rotation=30)

This correlation matrix tells us which features are most influential

from sklearn.cross_validation import KFold

def cross_validate(X, y, classifier, k_fold) :

    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(X), n_folds=k_fold,
                           shuffle=True, random_state=0)

    k_score_total = 0
    # for each training and testing slices run the classifier, and score the results
    for train_slice, test_slice in k_fold_indices :

        model = classifier(X[ train_slice  ],
                         y[ train_slice  ])

        k_score = model.score(X[ test_slice ],
                              y[ test_slice ])

        k_score_total += k_score

    # return the average accuracy
    return k_score_total/k_fold

model_lr = LogisticRegression(C=1).fit
cross_validate(features, target, model_lr, 11)

0.80022446689113347

model_lr = LogisticRegression(C=1).fit(features, target)
y = model_lr.predict(features)

from sklearn import metrics
metrics.precision_score(target, y)

0.778523489932886

target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_80'] = proba[0].apply(lambda x:0 if x<.5 else 1)
metrics.precision_score(target, proba['class_0_at_80'])

0.778523489932886

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1
                                                                         ])
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")

from sklearn.cross_validation import train_test_split

train_feat, test_feat, train_target, test_target = train_test_split(features, target, train_size=0.5)

model_lr = LogisticRegression(C=1).fit(train_feat, train_target)
target_predicted_proba = model_lr.predict_proba(test_feat)

plot_roc_curve(test_target, target_predicted_proba)

model_lr = LogisticRegression(C=1).fit(features, target)

df = pd.read_csv('test.csv', header = 0)
#mode=mode(df.Embarked)[0][0]
df.Embarked = df.Embarked.fillna('S')
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(medianAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
df.Fare = df.Fare.fillna(13) # one fare value slipped through the cracks. must investigate
df['familySize'] = df.SibSp + df.Parch +1
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
features = df[['familySize', 'Fare', 'Pclass', 'Gender', 'Port']].values

y = model_lr.predict(features)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = y
result.to_csv('titanic_results', sep = ',', index = None)

Lets set our threshold to different values

target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_58'] = proba[0].apply(lambda x:0 if x<.58 else 1)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = proba['class_0_at_58']
result.to_csv('titanic_results_with_threshold', sep = ',', index = None)

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.252716	0.523008	0.381594	32.674620
std	257.353842	0.486592	0.836071	13.211959	1.102743	0.806057	49.608084
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	4.012500
25%	223.500000	0.000000	2.000000	22.000000	0.000000	0.000000	7.925000
50%	446.000000	0.000000	3.000000	26.000000	0.000000	0.000000	14.500000
75%	668.500000	1.000000	3.000000	37.000000	1.000000	0.000000	31.275000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

Logistic regression using the Titanic dataset on Kaggle.¶

Submitting these results to Kaggle, we are rewarded with a score of .70¶