Lecture notes on logistic regression: http://george1328.github.io/lecture_notes/Logistic_Regression.pdf
import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
%pylab inline
# Load the training data; header=0 takes the column names from the first row.
df = pd.read_csv('train.csv', header = 0)
# Summary statistics (only displayed when this is the last expression of a
# notebook cell).
df.describe()
# Per-column non-null counts and dtypes.
df.info()
There are missing values in Age, Cabin and Embarked
# Summary of the Embarked column.
df.Embarked.describe()
# seaborn is cool. Here we can do the equivalent of a Pandas group by AND plot in one step.
# NOTE(review): the two positional arguments look like the old seaborn
# boxplot(values, groupby) form, i.e. Age grouped by Sex - confirm against the
# installed seaborn version.
sns.boxplot(df.Age, df.Sex)
# look at the distribution of ages
sns.kdeplot(df.Age, shade=True)
# Same kind of box plot, this time Age grouped by Survived.
sns.boxplot(df.Age, df.Survived)
# Let's populate the missing Embarked values with the most frequent (mode) value.
from scipy.stats import mode
# Store the result under its own name. Rebinding `mode` to the returned string
# would shadow the imported function, so running this code a second time (or
# reusing mode() later on) would fail with "'str' object is not callable".
embarked_mode = mode(df.Embarked)[0][0]
print(embarked_mode)
df.Embarked = df.Embarked.fillna(embarked_mode)
# Build a temporary frame for plotting: discard the Cabin column entirely,
# then discard every row that still contains a missing value.
temp = df.drop(['Cabin'], axis=1).dropna()
# The Pandas groupby function gives one sub-frame per passenger class;
# draw the Age histogram of each class in turn.
grouped = temp.groupby('Pclass')
for pclass in (1, 2, 3):
    grouped.get_group(pclass).Age.hist()
# Here we use matplotlib. Notice that this graph tells us that the age
# distribution is different for each class.
# figsize is passed as a keyword argument. Calling the pylab helper
# figsize(5, 3) inside plt.figure(...) would instead change the default size of
# every later figure and only works inside IPython's pylab namespace.
plt.figure(figsize=(5, 3))
# One density curve per passenger class, drawn in class order 1, 2, 3.
for pclass in (1, 2, 3):
    temp.Age[temp.Pclass == pclass].plot(kind='kde')
plt.xlabel("Age")
plt.title("Age by class")
# Legend entries follow the plotting order above.
plt.legend(('1st', '2nd','3rd'))
# Let's get the mean and median ages per passenger class.
# NOTE(review): `rows=` is the old pandas spelling of the pivot_table grouping
# argument, and the later lookups (meanAges[pclass] / medianAges[pclass])
# assume the result is indexed by Pclass - confirm before upgrading pandas.
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
print meanAges
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
print medianAges
# Let's plug in the median age of the passenger's class for the missing values.
# medianAges (not meanAges) is used so the code matches the stated intent and
# the imputation that is applied to the test set further down.
df.Age = df[['Age', 'Pclass']].apply(
    lambda x: int(medianAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'],
    axis=1)
# Count the fares recorded as 0 (the count is only displayed in a notebook cell).
(df.Fare==0).sum()
# Treat a fare of 0 as missing and replace it with the median fare of the
# passenger's class.
# NOTE(review): the medians are computed while the zero fares are still present.
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
#Now our data is fixed and ready
df.describe()
# Derived columns used as model inputs.
# familySize = SibSp + Parch, plus one (presumably the passenger - confirm).
df['familySize'] = df['SibSp'] + df['Parch'] + 1
# Sex as an integer code: female -> 0, male -> 1.
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
# Number the distinct Embarked values in the order np.unique returns them,
# then look every row's port up in that table (an unknown port raises KeyError).
port_dict = dict((name, i) for i, name in enumerate(np.unique(df['Embarked'])))
df['Port'] = df['Embarked'].map(port_dict.__getitem__)
# Let's view the data in one figure:
#   row 0 - survival counts, class counts, survival vs. age
#   row 1 - survival vs. gender
#   row 2 - survival counts of female passengers per class
#   row 3 - survival counts of male passengers per class
# The gender scatter gets a row of its own; sharing a grid cell with the
# "female 1st class" panel would make one of the two plots unreadable.
SURVIVAL_LABELS = {0: "Died", 1: "Survived"}
CLASS_LABELS = {1: "1st Class", 2: "2nd Class", 3: "3rd Class"}
CLASS_ORDINALS = {1: "1st", 2: "2nd", 3: "3rd"}
GRID = (4, 3)

# Height grown from 9 to 12 so each of the four rows keeps the height a row
# had in a three-row layout.
fig = plt.figure(figsize=(18, 12), dpi=1600)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=.5, wspace=.15)
alpha = alpha_scatterplot = 0.2
alpha_bar_chart = 0.55


def _plot_survival_counts(ax, counts, **bar_kwargs):
    """Draw a Survived value_counts() Series as a bar chart on `ax`.

    Tick labels are derived from the index of `counts`, so they stay correct
    whichever outcome value_counts() happens to list first.
    """
    counts.plot(kind='bar', ax=ax, **bar_kwargs)
    ax.set_xlim(-1, len(counts))
    ax.set_xticklabels([SURVIVAL_LABELS[key] for key in counts.index],
                       rotation=0)


# Row 0, left: overall survival counts.
ax_survival = plt.subplot2grid(GRID, (0, 0))
_plot_survival_counts(ax_survival, df.Survived.value_counts(),
                      alpha=alpha_bar_chart)
ax_survival.set_title("Distribution of Survival")

# Row 0, middle: passengers per class, labelled from the counted class values.
ax_class = plt.subplot2grid(GRID, (0, 1))
class_counts = df.Pclass.value_counts()
class_counts.plot(kind="bar", ax=ax_class, alpha=alpha_bar_chart)
ax_class.set_xticklabels([CLASS_LABELS[key] for key in class_counts.index],
                         rotation=0)
ax_class.set_title("Class Distribution")

# Row 0, right: survival against age.
ax_age = plt.subplot2grid(GRID, (0, 2))
ax_age.scatter(df.Survived, df.Age, alpha=alpha_scatterplot)
ax_age.set_ylabel("Age")
ax_age.set_title("Survival by Age")

# Row 1: survival against the integer Gender code.
ax_gender = plt.subplot2grid(GRID, (1, 0))
ax_gender.scatter(df.Survived, df.Gender, alpha=alpha_bar_chart)
ax_gender.set_ylabel("Gender")
ax_gender.set_title("Survival by Gender")

# Rows 2 and 3: survival counts for every (sex, class) combination.
for row, (sex, color) in enumerate((('female', '#d01265'),
                                    ('male', '#2aa198')), 2):
    for col, pclass in enumerate((1, 2, 3)):
        ax = plt.subplot2grid(GRID, (row, col))
        # A single combined mask instead of chaining two boolean selections.
        in_group = (df.Sex == sex) & (df.Pclass == pclass)
        _plot_survival_counts(ax, df.Survived[in_group].value_counts(),
                              color=color, alpha=0.65)
        ax.set_title("%s %s class" % (sex, CLASS_ORDINALS[pclass]))
# Fit a logistic regression on the engineered columns and chart its
# coefficients, one bar per feature.
feature_names = ['Gender', 'Pclass','Fare', 'familySize', 'Port']
features = df[feature_names].values
target = df['Survived'].values
model_lr = LogisticRegression(C=1)
model_lr.fit(features, target)
# coef_ is flattened so that it lines up with the bar positions.
x = np.arange(len(feature_names))
plt.bar(x, model_lr.coef_.ravel())
_ = plt.xticks(x + 0.5, feature_names, rotation=30)
This bar chart of the fitted logistic regression coefficients tells us which features are most influential
from sklearn.cross_validation import KFold
def cross_validate(X, y, classifier, k_fold):
    """Return the average test-fold score of `classifier` over a k-fold split.

    X, y       -- feature and target arrays; both are indexed with the index
                  arrays produced by KFold
    classifier -- callable invoked as classifier(X_train, y_train); the object
                  it returns must provide score(X_test, y_test)
    k_fold     -- number of folds
    """
    # Shuffled folds with a fixed random_state.
    folds = KFold(len(X), n_folds=k_fold, shuffle=True, random_state=0)
    fold_scores = []
    for train_idx, test_idx in folds:
        fitted = classifier(X[train_idx], y[train_idx])
        fold_scores.append(fitted.score(X[test_idx], y[test_idx]))
    # Average the per-fold scores.
    return sum(fold_scores) / k_fold
# cross_validate calls classifier(X_train, y_train) and then .score(...) on the
# returned object, so the bound fit method of an unfitted estimator is passed
# as the classifier (model_lr is a method here, not a fitted model).
model_lr = LogisticRegression(C=1).fit
# 11-fold cross-validation; the returned average is only displayed when this
# is the last expression of a notebook cell.
cross_validate(features, target, model_lr, 11)
# Refit on the full training set and measure precision on that same data.
model_lr = LogisticRegression(C=1).fit(features, target)
y = model_lr.predict(features)
from sklearn import metrics
metrics.precision_score(target, y)
# Repeat the measurement from the predicted probabilities (second column of
# predict_proba) with an explicit 0.5 cut-off.
target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
# The column name carries the cut-off in percent, so 0.5 is stored as "_at_50"
# (the label previously said 80 while the threshold was 0.5).
proba['class_0_at_50'] = proba[0].apply(lambda x:0 if x<.5 else 1)
metrics.precision_score(target, proba['class_0_at_50'])
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
def plot_roc_curve(target_test, target_predicted_proba):
    """Plot the ROC curve and its area for a set of predicted probabilities.

    target_test            -- true labels
    target_predicted_proba -- 2-D array of predicted probabilities; only
                              column 1 is used as the score
    """
    scores = target_predicted_proba[:, 1]
    fpr, tpr, _ = roc_curve(target_test, scores)
    roc_auc = auc(fpr, tpr)

    # The curve itself, then the diagonal of a random predictor for reference.
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')

    # Both axes are rates, so clamp them to [0, 1].
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])

    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
# Hold out half of the training data, fit on the other half and draw the ROC
# curve on the held-out half.
from sklearn.cross_validation import train_test_split
# NOTE(review): no random_state is passed, so the split (and therefore the
# curve) may differ from run to run.
train_feat, test_feat, train_target, test_target = train_test_split(features, target, train_size=0.5)
model_lr = LogisticRegression(C=1).fit(train_feat, train_target)
target_predicted_proba = model_lr.predict_proba(test_feat)
plot_roc_curve(test_target, target_predicted_proba)
# Train the final model on the whole training set, then prepare test.csv with
# the same steps that were applied to the training data and write predictions.
model_lr = LogisticRegression(C=1).fit(features, target)
df = pd.read_csv('test.csv', header = 0)
# Hard-coded fill value for missing ports.
# NOTE(review): presumably the most frequent port of the training set - confirm.
df.Embarked = df.Embarked.fillna('S')
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
df.Age = df[['Age', 'Pclass']].apply(
    lambda x: int(medianAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'],
    axis=1)
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
# A missing fare is imputed like a zero fare. `NaN == 0` is False, so without
# the explicit null check a NaN fare would pass through untouched (this is the
# value that previously had to be patched afterwards with a constant).
df.Fare = df[['Fare', 'Pclass']].apply(
    lambda x: medianFare[x['Pclass']]
    if (x['Fare'] == 0 or pd.isnull(x['Fare'])) else x['Fare'],
    axis=1)
df['familySize'] = df.SibSp + df.Parch +1
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
# Reuse the port_dict built from the training data so every port keeps the
# integer code the model was trained with; a port that was never seen in
# training raises KeyError instead of silently shifting the codes.
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
# Select the columns through feature_names: the model maps coefficients to
# columns by position, so the order must be exactly the training order.
features = df[feature_names].values
y = model_lr.predict(features)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = y
result.to_csv('titanic_results', sep = ',', index = None)
Let's set our threshold to different values
# Second submission: classify from the predicted probabilities with a custom
# cut-off instead of model_lr.predict.
target_predicted_proba = model_lr.predict_proba(features)
# Keep only the second probability column; it becomes column 0 of the frame.
proba = pd.DataFrame(target_predicted_proba[:,1])
# Predict 1 only when that probability is at least 0.58, otherwise 0.
proba['class_0_at_58'] = proba[0].apply(lambda x:0 if x<.58 else 1)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = proba['class_0_at_58']
result.to_csv('titanic_results_with_threshold', sep = ',', index = None)