Logistic regression using the Titanic dataset on Kaggle.

In [56]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
%pylab inline
Populating the interactive namespace from numpy and matplotlib

In [57]:
df = pd.read_csv('train.csv', header = 0)
In [58]:
df.describe()
Out[58]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [59]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)

There are missing values in Age, Cabin and Embarked

In [60]:
df.Embarked.describe()
Out[60]:
count     889
unique      3
top         S
freq      644
dtype: object
In [61]:
# seaborn is cool. Here we can the equivalent of a Pandas group by AND plot in one step
sns.boxplot(df.Age, df.Sex)
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x11114a810>
In [62]:
# loof at the distribution of ages
sns.kdeplot(df.Age, shade=True)
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1111be390>
In [63]:
sns.boxplot(df.Age, df.Survived)
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x1113bcd50>
In [64]:
# Lets populate 3 missing Embarked values with the mode value
from scipy.stats import mode
mode=mode(df.Embarked)[0][0]
print mode
df.Embarked = df.Embarked.fillna(mode)
S

In [65]:
#Lets use a temp df to drop missing values. First drop all Cabin NAs, then drop the rest
temp = df.drop(['Cabin'], axis =1)
temp = temp.dropna()
In [66]:
# Here's a way to view data using the Pandas groupby function
grouped = temp.groupby('Pclass')
grouped.get_group(1).Age.hist()
grouped.get_group(2).Age.hist()
grouped.get_group(3).Age.hist()
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x111518d50>
In [67]:
#Here we use matplotlip. Notice that this graph tells us that age distribution is different for each class
plt.figure(figsize(5,3))
temp.Age[temp.Pclass == 1].plot(kind='kde')    
temp.Age[temp.Pclass == 2].plot(kind='kde')
temp.Age[temp.Pclass == 3].plot(kind='kde')
plt.xlabel("Age")    
plt.title("Age by class")
plt.legend(('1st', '2nd','3rd'))
Out[67]:
<matplotlib.legend.Legend at 0x111a31910>
In [68]:
#Lets get the mean and median ages
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
print meanAges
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
print medianAges
Pclass
1         38.233441
2         29.877630
3         25.140620
Name: Age, dtype: float64
Pclass
1         37
2         29
3         24
Name: Age, dtype: float64

In [69]:
#Lets plugin median ages for the missing values
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(meanAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)
In [70]:
(df.Fare==0).sum()
Out[70]:
15
In [71]:
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
In [72]:
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
In [73]:
#Now our data is fixed and ready
df.describe()
Out[73]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.252716 0.523008 0.381594 32.674620
std 257.353842 0.486592 0.836071 13.211959 1.102743 0.806057 49.608084
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 4.012500
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.925000
50% 446.000000 0.000000 3.000000 26.000000 0.000000 0.000000 14.500000
75% 668.500000 1.000000 3.000000 37.000000 1.000000 0.000000 31.275000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [74]:
#some data re-engineering
df['familySize'] = df.SibSp + df.Parch +1
In [75]:
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
In [76]:
port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
In [77]:
#Lets view the data
fig = plt.figure(figsize=(18,9), dpi=1600)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace =.5, wspace=.15)
alpha=alpha_scatterplot = 0.2 
alpha_bar_chart = 0.55

ax1 = plt.subplot2grid((3,3),(0,0))
df.Survived.value_counts().plot(kind='bar', alpha=alpha_bar_chart)
ax1.set_xlim(-1, 2)
ax1.set_xticklabels(["Died", "Survived"], rotation=0)
plt.title("Distribution of Survival")    

plt.subplot2grid((3,3),(0,2))
plt.scatter(df.Survived, df.Age, alpha=alpha_scatterplot)
plt.ylabel("Age")
plt.title("Survial by Age")

ax3 = plt.subplot2grid((3,3),(0,1))
df.Pclass.value_counts().plot(kind="bar", alpha=alpha_bar_chart)
ax3.set_xticklabels(["3rd Class", "1st Class", "2nd Class"], rotation=0)
plt.title("Class Distribution")

plt.subplot2grid((3,3),(1,0))
plt.scatter(df.Survived, df.Gender, alpha=alpha_bar_chart)
plt.ylabel("Age")
plt.title("Survial by Gender")

ax4 = plt.subplot2grid((3,3),(1,0))
female1class = df.Survived[df.Sex == 'female'][df.Pclass == 1].value_counts()
female1class.plot(kind='bar',  color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female1class))
plt.title("female 1st class")

ax4 = plt.subplot2grid((3,3),(1,1))
female2class = df.Survived[df.Sex == 'female'][df.Pclass == 2].value_counts()
female2class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female2class))
plt.title("female 2nd class")


ax4 = plt.subplot2grid((3,3),(1,2))
female3class = df.Survived[df.Sex == 'female'][df.Pclass == 3].value_counts()
female3class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(female3class))
plt.title("female 3rd class")


ax4 = plt.subplot2grid((3,3),(2,0))
male1 = df.Survived[df.Sex == 'male'][df.Pclass == 1].value_counts()
male1.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male1))
plt.title("male 1st class")


ax4 = plt.subplot2grid((3,3),(2,1))
male2 = df.Survived[df.Sex == 'male'][df.Pclass == 2].value_counts()
male2.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male2))
plt.title("male 2nd class")


ax4 = plt.subplot2grid((3,3),(2,2))
male3 = df.Survived[df.Sex == 'male'][df.Pclass == 3].value_counts()
male3.plot(kind='bar', color = '#2aa198', alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male3))
plt.title("male 3rd class")
Out[77]:
<matplotlib.text.Text at 0x110dad210>
In [78]:
features = df[['Gender', 'Pclass','Fare', 'familySize', 'Port' ]].values
target = df['Survived'].values
In [79]:
feature_names = ['Gender', 'Pclass','Fare', 'familySize', 'Port']
model_lr = LogisticRegression(C=1).fit(features, target)
x = np.arange(len(feature_names))
In [80]:
plt.bar(x, model_lr.coef_.ravel())
_ = plt.xticks(x + 0.5, feature_names, rotation=30)

This correlation matrix tells us which features are most influential

In [81]:
from sklearn.cross_validation import KFold

def cross_validate(X, y, classifier, k_fold) :

    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(X), n_folds=k_fold,
                           shuffle=True, random_state=0)

    k_score_total = 0
    # for each training and testing slices run the classifier, and score the results
    for train_slice, test_slice in k_fold_indices :

        model = classifier(X[ train_slice  ],
                         y[ train_slice  ])

        k_score = model.score(X[ test_slice ],
                              y[ test_slice ])

        k_score_total += k_score

    # return the average accuracy
    return k_score_total/k_fold
In [82]:
model_lr = LogisticRegression(C=1).fit
cross_validate(features, target, model_lr, 11)
Out[82]:
0.80022446689113347
In [83]:
model_lr = LogisticRegression(C=1).fit(features, target)
y = model_lr.predict(features)
In [84]:
from sklearn import metrics
metrics.precision_score(target, y)  
Out[84]:
0.778523489932886
In [85]:
target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_80'] = proba[0].apply(lambda x:0 if x<.5 else 1)
metrics.precision_score(target, proba['class_0_at_80'])  
Out[85]:
0.778523489932886
In [86]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1
                                                                         ])
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
In [87]:
from sklearn.cross_validation import train_test_split

train_feat, test_feat, train_target, test_target = train_test_split(features, target, train_size=0.5)
In [88]:
model_lr = LogisticRegression(C=1).fit(train_feat, train_target)
target_predicted_proba = model_lr.predict_proba(test_feat)
In [89]:
plot_roc_curve(test_target, target_predicted_proba)
In [90]:
model_lr = LogisticRegression(C=1).fit(features, target)
In [91]:
df = pd.read_csv('test.csv', header = 0)
#mode=mode(df.Embarked)[0][0]
df.Embarked = df.Embarked.fillna('S')
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(medianAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
df.Fare = df.Fare.fillna(13) # one fare value slipped through the cracks. must investigate
df['familySize'] = df.SibSp + df.Parch +1
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
features = df[['familySize', 'Fare', 'Pclass', 'Gender', 'Port']].values
In [92]:
y = model_lr.predict(features)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = y
result.to_csv('titanic_results', sep = ',', index = None)

Lets set our threshold to different values

In [93]:
target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_58'] = proba[0].apply(lambda x:0 if x<.58 else 1)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = proba['class_0_at_58']
result.to_csv('titanic_results_with_threshold', sep = ',', index = None)

Submitting these results to Kaggle, we are rewarded with a score of .70