Friday, May 31, 2019

Regression model of highway-mpg and price

#The code below explores a linear regression between highway miles per gallon (highway-mpg) and the price of a vehicle


#Simple_regression

#Reminder: a # (hash) starts a comment in Python, so the notes on the code below are written after a #

#import libraries

import pandas as pd      #useful to open csv comma separated values documents
import numpy as np      #useful to classify data
import matplotlib.pyplot as plt      #useful to graph
import seaborn as sns                    #useful to draw statistical data https://seaborn.pydata.org/


# Location of the automobile dataset (a CSV file) that the analysis reads

path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/automobileEDA.csv'

# Load the CSV into a pandas DataFrame; the rest of the script reads its
# 'highway-mpg', 'price' and 'peak-rpm' columns
df = pd.read_csv(path)
# Preview the first rows as a sanity check. A bare `df.head()` is only
# displayed when it is the last expression of a notebook cell; printing it
# makes the preview appear when the file is run as a script too.
print(df.head())



# Create the linear regression object
from sklearn.linear_model import LinearRegression

lm = LinearRegression()

# Predictor: the double brackets keep X a one-column DataFrame (2-D),
# which is the layout scikit-learn estimators expect for the features
X = df[['highway-mpg']]
# Target: the price column as a Series
Y = df['price']

# Fit price as a linear function of highway-mpg. Without this step the model
# object is never trained and X / Y are never used.
lm.fit(X, Y)

# Report the fitted line: price = intercept + slope * highway-mpg
print('intercept: ' + str(lm.intercept_))
print('slope (change in price per highway-mpg): ' + str(lm.coef_[0]))

# Size of the figure for the regression plot, passed to matplotlib as
# figsize=(width, height)
width, height = 12, 5

# Open a new figure and draw price against highway-mpg with seaborn's
# regplot (scatter points plus a fitted regression line)
plt.figure(figsize=(width, height))
sns.regplot(data=df, x="highway-mpg", y="price")
# Pin the bottom of the price axis at zero; only the lower limit is given
plt.ylim(bottom=0)




# Pairwise correlation matrix of the three columns of interest,
# printed with a trailing label describing what the table shows
corr_columns = ["peak-rpm", "highway-mpg", "price"]
correlation = df[corr_columns].corr()
print(f"{correlation!s} this is correlation between peak-rpm,highway-mpg,price")



#We can conclude that the slope is negative. A prediction would lie on that regression line, within an error range indicated by the MSE



#Below is the residual plot
#It regresses price on highway-mpg and plots the residuals (observed minus
#fitted values). Residuals scattered evenly around zero suggest that a
#straight line is a reasonable fit; a visible pattern suggests it is not.

width = 12
height = 10
plt.figure(figsize=(width, height))
# x and y are passed by keyword: recent seaborn releases (0.12+) make them
# keyword-only, so the positional form raises a TypeError there. Keywords
# also work on older releases and match the regplot call style.
sns.residplot(x="highway-mpg", y="price", data=df)
# Display every figure created so far
plt.show()

No comments:

Post a Comment