Task: Predict a student's score from the number of hours studied using simple linear regression.
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Importing the data
df = pd.read_csv('student_scores - student_scores.csv')
print("Overview:")
df.head() #getting first 5 rows
Overview:
|   | Hours | Scores |
|---|-------|--------|
| 0 | 2.5   | 21     |
| 1 | 5.1   | 47     |
| 2 | 3.2   | 27     |
| 3 | 8.5   | 75     |
| 4 | 3.5   | 30     |
#getting the shape of the data
print("Shape:",df.shape)
Shape: (25, 2)
# Checking for null values in each column
print(df['Hours'].isna().sum())
print(df['Scores'].isna().sum())
0
0
#description of the data
df.describe()
|       | Hours     | Scores    |
|-------|-----------|-----------|
| count | 25.000000 | 25.000000 |
| mean  | 5.012000  | 51.480000 |
| std   | 2.525094  | 25.286887 |
| min   | 1.100000  | 17.000000 |
| 25%   | 2.700000  | 30.000000 |
| 50%   | 4.800000  | 47.000000 |
| 75%   | 7.400000  | 75.000000 |
| max   | 9.200000  | 95.000000 |
# Plotting the (Hours, Scores) pairs with a scatterplot
ax = sns.scatterplot(data=df, x='Hours', y='Scores', color='coral')
plt.title('Hours / Scores')
plt.xlabel('Hours Studied')
plt.ylabel('Scores Achieved')
plt.grid()
plt.show()
We can see a clear linear relationship between the two variables, which suggests that the longer a student studies, the higher the score they tend to achieve.
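That visual impression can also be checked numerically with the Pearson correlation coefficient. A minimal sketch, using only the five preview rows shown in `df.head()` above (the full CSV has 25 rows, so the value on the full data will differ slightly):

```python
import numpy as np

# First five (Hours, Scores) pairs from df.head(); the full dataset has 25 rows
hours = np.array([2.5, 5.1, 3.2, 8.5, 3.5])
scores = np.array([21, 47, 27, 75, 30])

# Pearson r: covariance of the two variables divided by the product of their stds
r = np.corrcoef(hours, scores)[0, 1]
print(f"Pearson correlation: {r:.4f}")
```

A value close to +1 indicates a strong positive linear relationship, which supports fitting a straight line.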
# Extracting the feature matrix and target vector
x = df.iloc[:, :-1].values  # Hours, kept 2-D because sklearn expects a feature matrix
y = df.iloc[:, 1].values    # Scores
from sklearn.model_selection import train_test_split

# Splitting our dataset into train and test sets with an 80:20 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
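With 25 rows and `test_size=0.2`, the split should yield 20 training and 5 test samples. A quick sanity sketch on dummy stand-in arrays of the same shape (not the real data):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Dummy stand-ins with the same shape as the real data: 25 samples, 1 feature
x_demo = np.arange(25).reshape(-1, 1)
y_demo = np.arange(25)

x_tr, x_te, y_tr, y_te = train_test_split(x_demo, y_demo, test_size=0.2, random_state=0)
print(len(x_tr), len(x_te))  # 20 5
```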
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)  # fitting the linear regression model on the training data
y_pred = regressor.predict(x_test)
print(y_pred)  # predicted
print(y_test)  # actual
[16.88414476 33.73226078 75.357018   26.79480124 60.49103328]
[20 27 69 30 62]
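A side-by-side table makes the comparison easier to read; the sketch below hardcodes the predicted and actual values printed above:

```python
import pandas as pd

# Predicted and actual test values copied from the output above
comparison = pd.DataFrame({
    'Actual': [20, 27, 69, 30, 62],
    'Predicted': [16.88414476, 33.73226078, 75.357018, 26.79480124, 60.49103328],
})
comparison['Error'] = comparison['Actual'] - comparison['Predicted']
print(comparison)
```

Every prediction lands within about seven marks of the true score.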
def Visualization(x, y):
    '''This function visualizes the results of given set with linear regression given x and y'''
    plt.scatter(x, y, color='coral')
    plt.plot(x, regressor.predict(x), color='black')
    plt.title('Hours vs Scores')
    plt.xlabel('Hours Studied')
    plt.ylabel('Scores Achieved')
    plt.show()
help(Visualization)
Help on function Visualization in module __main__:

Visualization(x, y)
    This function visualizes the results of given set with linear regression given x and y
print("Visualization of Train Set:")
Visualization(x_train,y_train)
print("\n\nVisualization of Test Set:")
Visualization(x_test,y_test)
Visualization of Train Set:
Visualization of Test Set:
In the train-set plot, the regression line (the predicted scores) lies close to the coral points (the actual scores). In the test-set plot, drawn from data the model never saw during training, the predictions remain close to the actual points as well, which indicates the model generalizes properly.
from sklearn.metrics import r2_score  # we could also use MSE/RMSE or MAE
r2_score(y_test, y_pred)
# R^2 is negative if the model performs worse than predicting the mean, 0 for a constant mean prediction, and close to 1 when nearly perfect.
0.9454906892105355
help(r2_score)
As the docstring explains, a constant model that always predicts the mean of y receives an R^2 score of 0.0, and a model that fits worse than this baseline receives a negative score. Because R^2 is built on squared residuals, large errors are penalized far more heavily than small ones: the more variance the model leaves unexplained, the more negative the score and the poorer the fit.
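The score above can be cross-checked by hand, and the import comment above mentions MAE and RMSE as alternatives. A sketch that recomputes all three from the test values printed earlier:

```python
import numpy as np

# Actual and predicted test values copied from the output above
y_true = np.array([20, 27, 69, 30, 62])
y_hat = np.array([16.88414476, 33.73226078, 75.357018, 26.79480124, 60.49103328])

ss_res = np.sum((y_true - y_hat) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # total sum of squares
r2 = 1 - ss_res / ss_tot                        # ≈ 0.9455, matching r2_score
mae = np.mean(np.abs(y_true - y_hat))           # mean absolute error
rmse = np.sqrt(np.mean((y_true - y_hat) ** 2))  # root mean squared error
print(f"R2={r2:.4f}  MAE={mae:.3f}  RMSE={rmse:.3f}")
```

MAE and RMSE report the error in the same units as the scores themselves, which can be easier to interpret than a unitless ratio.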
predicted_score = regressor.predict([[9.25]])[0]
print(f"If a student studies 9.25 hours a day, his predicted score will be {predicted_score:.2f}.")
If a student studies 9.25 hours a day, his predicted score will be 93.69.
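With a single feature, the fitted model is just the line ŷ = intercept_ + coef_ · x, so any prediction can be reproduced from the two learned parameters. The sketch below refits on the five preview rows only (not the real 20-row training set, so its numbers will not match the 93.69 above):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Refit on the five preview rows only (illustrative; not the real training split)
hours = np.array([[2.5], [5.1], [3.2], [8.5], [3.5]])
scores = np.array([21, 47, 27, 75, 30])
model = LinearRegression().fit(hours, scores)

# predict() is equivalent to evaluating the line equation intercept_ + coef_ * x
manual = model.intercept_ + model.coef_[0] * 9.25
print(manual, model.predict([[9.25]])[0])
```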