This notebook will demonstrate using linear and polynomial regression to model the relationship between feature variables and the response variable.
We will use the MPG dataset to practice using regression to predict the fuel economy (MPG) of a car given its features.
For the mathematical explanation and theory, please view lecture notes: http://george1328.github.io/lecture_notes/Regression_Regularization.pdf
import pandas as pd
%pylab inline
# Column/Feature label are not available in the dataset, so we create a list of features using auto-mpg.names
features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
# Import the data directly into pandas from the url, specify header=None as column labels are not in dataset
import urllib
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# file is fixed-width-format so we will use read_fwf instead of read_csv
df = pd.read_fwf(urllib.urlopen(url), header = None)
df.columns = features
# Alternatively, we can download the data
# We use the bang(!) within iPython Notebooks to run command line statements directly from the Notebook
! curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
! curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names
# Since this dataset has no column headings, we need to explicitely state names=features
df = pd.read_fwf("auto-mpg.data", names = features)
# Use head, describe, info and unique to get a sense of the data.
df.describe()
df.head()
df.info()

# 'car name' and 'horsepower' are the only non-numeric fields. The name of a
# car is unlikely to have an influence on the MPG.
df.horsepower.unique()
# Missing horsepower values are encoded as the string '?' in the raw file.
(df.horsepower == '?').sum()

# Convert horsepower to a numeric field so we can use it in our analysis.
# DataFrame.convert_objects was deprecated and removed from pandas;
# pd.to_numeric with errors='coerce' turns the '?' placeholders into NaN.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Drop the 6 records that are missing horsepower. We could estimate these
# missing values, but for the sake of accuracy (and because it is only
# 6 rows) we will not.
df = df.dropna()
df.info()
import seaborn as sns

# Seaborn >= 0.12 requires explicit x=/y= keyword arguments; positional
# data arguments beyond the first were deprecated and then removed.
sns.boxplot(x=df.mpg, y=df.cylinders)
# This is interesting: 4-cylinder vehicles have better mileage on average
# than 3-cylinder vehicles.
three_cyl = df[df.cylinders == 3]
print(three_cyl['car name'])  # print is a function in Python 3
## Aha! Tiny Mazda roadsters...
sns.violinplot(x=df.mpg, y=df['model year'])
# Fancy seaborn graphing
sns.barplot(x=df.mpg, y=df.horsepower)
sns.barplot(x=df.mpg, y=df.weight)
sns.boxplot(x=df.mpg, y=df.origin)
# Although the values of origin are not documented, we can guess that
# 1 = USA, 2 = Europe and 3 = Japan... maybe...
sns.boxplot(x=df.mpg, y=df.acceleration)
# Little cars have pretty good acceleration AND good mileage, so not a
# great association.
sns.kdeplot(x=df.mpg, y=df.cylinders)
# Showing different plot options in seaborn :-)
An alternative method for visualizing the data is to print the correlation matrix.
# Correlation matrix over the numeric columns only — pandas >= 2.0 raises
# on the string 'car name' column without numeric_only=True.
df.corr(numeric_only=True)

# Create numpy arrays X and y with the predictor and response variables.
X = df[['weight', 'model year', 'horsepower', 'origin', 'displacement']].values
y = df['mpg'].values

# sklearn.cross_validation was removed; train_test_split now lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the data for evaluating the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

for i, prediction in enumerate(predictions):
    print('Predicted: %s, Actual: %s' % (prediction, y_test[i]))

# R^2 score of the fitted model on the held-out test set.
model.score(X_test, y_test)
sns.regplot(x=predictions, y=y_test)
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all degree-2 polynomial and interaction terms.
quad_model = PolynomialFeatures(degree=2)

# Fit the transformer on the training data only, then apply the SAME
# transform to the test data — calling fit_transform on the test set is
# the wrong pattern (the transformer must be fit on training data alone).
quad_X_train = quad_model.fit_transform(X_train)
quad_X_test = quad_model.transform(X_test)

model.fit(quad_X_train, y_train)
predictions = model.predict(quad_X_test)

for i, prediction in enumerate(predictions):
    print('Predicted: %s, Actual: %s' % (prediction, y_test[i]))

# R^2 score of the quadratic model on the held-out test set.
model.score(quad_X_test, y_test)
sns.regplot(x=predictions, y=y_test)

# sklearn.cross_validation was removed; cross_val_score now lives in
# sklearn.model_selection.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, quad_X_train, y_train, cv=10)

# R squared is the proportion of variance in the response variable that is
# explained by the model; a score of 1.0 would be a perfect prediction.
scores