# Linear regression between highway miles per gallon (mpg) and vehicle price.
# Reminder: a hash (#) starts a comment; the code below is explained with such comments.
#import libraries
import pandas as pd #useful to open csv comma separated values documents
import numpy as np #useful to classify data
import matplotlib.pyplot as plt #useful to graph
import seaborn as sns #useful to draw statistical data https://seaborn.pydata.org/
# Remote location of the automobile CSV dataset.
path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/automobileEDA.csv'
# Read the CSV into a pandas DataFrame (this downloads the file from `path`).
df = pd.read_csv(path)
# Create the linear regression estimator. It is only instantiated here;
# nothing in this section calls lm.fit().
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Predictor: the double brackets select a one-column DataFrame (2-D), which is
# the shape scikit-learn estimators take for the feature matrix.
X = df[['highway-mpg']]
# Response: the single brackets select the 'price' column as a Series (1-D).
Y = df['price']
# Figure size (in inches) for the regression plot.
width, height = 12, 5
# Scatter plot of price against highway-mpg with a fitted regression line,
# drawn by seaborn on a new matplotlib figure.
plt.figure(figsize=(width, height))
sns.regplot(data=df, x="highway-mpg", y="price")
# Pin the lower y-limit at zero and leave the upper limit automatic.
plt.ylim(bottom=0)
# Pairwise correlation matrix of the three columns of interest.
correlation = df[
    ["peak-rpm", "highway-mpg", "price"]
].corr()
print(f"{correlation!s} this is correlation between peak-rpm,highway-mpg,price")
# Author's observation: the fitted line has a negative slope, and predictions
# are expected to lie along that line, within the range given by the MSE.
# Residual plot: seaborn regresses price on highway-mpg and scatters the
# residuals (observed minus predicted) around zero. It is used to judge how
# well the linear predictor describes the data.
width = 12
height = 10
plt.figure(figsize=(width, height))
# x and y must be passed by keyword: recent seaborn releases accept only
# `data` positionally, so the old positional form raises a TypeError.
sns.residplot(x="highway-mpg", y="price", data=df)
plt.show()