You are on page 1 of 6

keyboard_arrow_down DISHANT KUMAR YADAV 2021BCS0136

#DISHANT KUMAR YADAV


import numpy as np
import pandas as pd

# Read the salary CSV into a DataFrame; the bare name on the last line makes
# the notebook display the loaded table.
DATA_PATH = '/content/sample_data/Salary_Data.csv'
df = pd.read_csv(DATA_PATH)
df
#DISHANT KUMAR YADAV

YearsExperience Salary

0 1.1 39343.0

1 1.3 46205.0

2 1.5 37731.0

3 2.0 43525.0

4 2.2 39891.0

5 2.9 56642.0

6 3.0 60150.0

7 3.2 54445.0

8 3.2 64445.0

9 3.7 57189.0

10 3.9 63218.0

11 4.0 55794.0

12 4.0 56957.0

13 4.1 57081.0

14 4.5 61111.0

15 4.9 67938.0

16 5.1 66029.0

17 5.3 83088.0

18 5.9 81363.0

19 6.0 93940.0

20 6.8 91738.0

21 7.1 98273.0

22 7.9 101302.0

23 8.2 113812.0

24 8.7 109431.0

25 9.0 105582.0

26 9.5 116969.0

27 9.6 112635.0

28 10.3 122391.0

29 10.5 121872.0

#DISHANT KUMAR YADAV


import matplotlib.pyplot as plt

# Pull the feature and target columns out of the frame in one step.
exp, sal = df['YearsExperience'], df['Salary']

# Scatter of salary (y) against years of experience (x).
plt.scatter(x=exp, y=sal)
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
#DISHANT KUMAR YADAV
Text(0, 0.5, 'Salary')

#DISHANT KUMAR YADAV


# Convert both columns to NumPy arrays, then echo their shapes as a sanity check.
exp_np, sal_np = (column.to_numpy() for column in (exp, sal))

(exp_np.shape, sal_np.shape)
#DISHANT KUMAR YADAV

((30,), (30,))

#DISHANT KUMAR YADAV


from sklearn.linear_model import LinearRegression

# scikit-learn estimators take a 2-D feature matrix, so the 1-D experience
# array is reshaped into a single column. Using -1 lets NumPy infer the number
# of rows instead of hard-coding the 30 rows of this particular CSV, and the
# reshaped matrix is built once and shared by fit() and predict().
exp_features = exp_np.reshape(-1, 1)

# Fit the linear model on the full dataset.
sklearn_model = LinearRegression().fit(exp_features, sal_np)

# In-sample predictions: the model is evaluated on the same rows it was fit on.
sklearn_sal_predictions = sklearn_model.predict(exp_features)
sklearn_sal_predictions.shape
#DISHANT KUMAR YADAV

(30,)

#DISHANT KUMAR YADAV


# Overlay the observed salaries and the model's predictions on the same axes.
exp = df['YearsExperience']
sal = df['Salary']

# Each series carries a label so the legend tells actual from predicted
# points; without labels the two scatters differ only by matplotlib's default
# colour cycle.
plt.scatter(exp, sal, label='Actual salary')
plt.scatter(exp, sklearn_sal_predictions, label='Sklearn salary prediction')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.legend()
#DISHANT KUMAR YADAV

output <matplotlib.collections.PathCollection at 0x7c7e2822d360>


#DISHANT KUMAR YADAV
# Table placing the input, the observed salary and the model's prediction
# side by side, one column each.
prediction_columns = {
    'YearsExperience': exp,
    'Salary': sal,
    'Sklearn salary prediction': sklearn_sal_predictions,
}
predictions_df = pd.DataFrame(prediction_columns)

predictions_df
#DISHANT KUMAR YADAV

YearsExperience Salary Sklearn salary prediction

0 1.1 39343.0 36187.158752

1 1.3 46205.0 38077.151217

2 1.5 37731.0 39967.143681

3 2.0 43525.0 44692.124842

4 2.2 39891.0 46582.117306

5 2.9 56642.0 53197.090931

6 3.0 60150.0 54142.087163

7 3.2 54445.0 56032.079627

8 3.2 64445.0 56032.079627

9 3.7 57189.0 60757.060788

10 3.9 63218.0 62647.053252

11 4.0 55794.0 63592.049484

12 4.0 56957.0 63592.049484

13 4.1 57081.0 64537.045717

14 4.5 61111.0 68317.030645

15 4.9 67938.0 72097.015574

16 5.1 66029.0 73987.008038

17 5.3 83088.0 75877.000502

18 5.9 81363.0 81546.977895

19 6.0 93940.0 82491.974127

20 6.8 91738.0 90051.943985

21 7.1 98273.0 92886.932681

22 7.9 101302.0 100446.902538

23 8.2 113812.0 103281.891235

24 8.7 109431.0 108006.872395

25 9.0 105582.0 110841.861092

26 9.5 116969.0 115566.842252

27 9.6 112635.0 116511.838485

28 10.3 122391.0 123126.812110

29 10.5 121872.0 125016.804574


keyboard_arrow_down DISHANT KUMAR YADAV 2021BCS0136
# Step 1: Import the required python packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Step 2: Load the dataset
df = pd.read_csv('/content/sample_data/Salary_Data.csv')

# Step 3: Data analysis - a scatter plot of salary against years of experience
# shows how the two variables vary together.
exp, sal = df['YearsExperience'], df['Salary']

axes = plt.gca()
axes.scatter(exp, sal)
axes.set(
    xlabel='Experience',
    ylabel='Salary',
    title='Distribution of Experience vs. Salary',
)
plt.show()

output

# Step 4: Split the dataset into dependent/independent variables
# The double brackets keep X as a one-column DataFrame (2-D); y is a Series.
X, y = df[['YearsExperience']], df['Salary']

# Step 5: Split data into Train/Test sets
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Train the regression model
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the notebook's estimator display.
regression_model = LinearRegression().fit(X_train, y_train)
regression_model

▾ LinearRegression
LinearRegression()
# Step 7: Plot the training results
# Blue points are the training samples; the red line is the model's
# prediction for those same inputs.
train_fit = regression_model.predict(X_train)

axes = plt.gca()
axes.scatter(X_train, y_train, color='blue')
axes.plot(X_train, train_fit, color='red')
axes.set(
    xlabel='Experience',
    ylabel='Salary',
    title='Training Results: Experience vs. Salary',
)
plt.show()

# Step 8: Plot the test results


# Blue points are the held-out test samples. The red line is computed from the
# training inputs, so it is the same fitted line as in the training plot.
fitted_line = regression_model.predict(X_train)

axes = plt.gca()
axes.scatter(X_test, y_test, color='blue')
axes.plot(X_train, fitted_line, color='red')
axes.set(
    xlabel='Experience',
    ylabel='Salary',
    title='Test Results: Experience vs. Salary',
)
plt.show()

You might also like