You are on page 1of 16

10/20/23, 3:18 PM Experiment_6

In [2]: import pandas as pd

import scipy
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv('diabetes.csv')

In [3]: print(df.head())

Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \

0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1

DiabetesPedigreeFunction Age Outcome

0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1

In [4]:

<bound method of Pregnancies Glucose BloodPressure SkinThicknes

s Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
.. ... ... ... ... ... ...
763 10 101 76 48 180 32.9
764 2 122 70 27 0 36.8
765 5 121 72 23 112 26.2
766 1 126 60 0 0 30.1
767 1 93 70 31 0 30.4

DiabetesPedigreeFunction Age Outcome

0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
.. ... ... ...
763 0.171 63 0
764 0.340 27 0
765 0.245 30 0
766 0.349 47 1
767 0.315 23 0

[768 rows x 9 columns]>

In [5]: df.isnull().sum()

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 1/8

10/20/23, 3:18 PM Experiment_6
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64

In [6]: df.describe()

Out[6]: Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre

count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 7

mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578

std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160

min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000

50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000

75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000

max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000

In [24]: fig, axs = plt.subplots(9,1,dpi=95, figsize=(7,17))

i = 0
for col in df.columns:
axs[i].boxplot(df[col], vert=False)

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 2/8

10/20/23, 3:18 PM Experiment_6

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 3/8

10/20/23, 3:18 PM Experiment_6

In [25]: # Identify the quartiles

q1, q3 = np.percentile(df['Insulin'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Drop the outliers
clean_data = df[(df['Insulin'] >= lower_bound)
& (df['Insulin'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['Pregnancies'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Drop the outliers
clean_data = clean_data[(clean_data['Pregnancies'] >= lower_bound)
& (clean_data['Pregnancies'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['Age'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Drop the outliers
clean_data = clean_data[(clean_data['Age'] >= lower_bound)
& (clean_data['Age'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['Glucose'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 4/8
10/20/23, 3:18 PM Experiment_6
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Drop the outliers
clean_data = clean_data[(clean_data['Glucose'] >= lower_bound)
& (clean_data['Glucose'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['BloodPressure'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (0.75 * iqr)
upper_bound = q3 + (0.75 * iqr)
# Drop the outliers
clean_data = clean_data[(clean_data['BloodPressure'] >= lower_bound)
& (clean_data['BloodPressure'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['BMI'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Drop the outliers
clean_data = clean_data[(clean_data['BMI'] >= lower_bound)
& (clean_data['BMI'] <= upper_bound)]

# Identify the quartiles

q1, q3 = np.percentile(clean_data['DiabetesPedigreeFunction'], [25, 75])
# Calculate the interquartile range
iqr = q3 - q1
# Calculate the lower and upper bounds
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

# Drop the outliers

clean_data = clean_data[(clean_data['DiabetesPedigreeFunction'] >= lower_bound)
& (clean_data['DiabetesPedigreeFunction'] <= upper_bound)]

In [26]: #correlation
corr = df.corr()

sns.heatmap(df.corr(), annot=True, fmt= '.2f')

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 5/8

10/20/23, 3:18 PM Experiment_6

In [27]: corr['Outcome'].sort_values(ascending = False)

Outcome 1.000000
Glucose 0.466581
BMI 0.292695
Age 0.238356
Pregnancies 0.221898
DiabetesPedigreeFunction 0.173844
Insulin 0.130548
SkinThickness 0.074752
BloodPressure 0.065068
Name: Outcome, dtype: float64

In [28]: plt.pie(df.Outcome.value_counts(),
labels= ['Diabetes', 'Not Diabetes'],
autopct='%.f', shadow=True)
plt.title('Outcome Proportionality')

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 6/8

10/20/23, 3:18 PM Experiment_6

In [29]: # separate array into input and output components

X = df.drop(columns =['Outcome'])
Y = df.Outcome

In [30]: # initialising the MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# learning the statistical parameters for each of the data and transforming
rescaledX = scaler.fit_transform(X)

array([[0.35294118, 0.74371859, 0.59016393, 0.35353535, 0. ,

0.50074516, 0.23441503, 0.48333333],
[0.05882353, 0.42713568, 0.54098361, 0.29292929, 0. ,
0.39642325, 0.11656704, 0.16666667],
[0.47058824, 0.91959799, 0.52459016, 0. , 0. ,
0.34724292, 0.25362938, 0.18333333],
[0.05882353, 0.44723618, 0.54098361, 0.23232323, 0.11111111,
0.41877794, 0.03800171, 0. ],
[0. , 0.68844221, 0.32786885, 0.35353535, 0.19858156,
0.64232489, 0.94363792, 0.2 ]])

In [31]: from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 7/8

10/20/23, 3:18 PM Experiment_6
array([[ 0.63994726, 0.84832379, 0.14964075, 0.90726993, -0.69289057,
0.20401277, 0.46849198, 1.4259954 ],
[-0.84488505, -1.12339636, -0.16054575, 0.53090156, -0.69289057,
-0.68442195, -0.36506078, -0.19067191],
[ 1.23388019, 1.94372388, -0.26394125, -1.28821221, -0.69289057,
-1.10325546, 0.60439732, -0.10558415],
[-0.84488505, -0.99820778, -0.16054575, 0.15453319, 0.12330164,
-0.49404308, -0.92076261, -1.04154944],
[-1.14185152, 0.5040552 , -1.50468724, 0.90726993, 0.76583594,
1.4097456 , 5.4849091 , -0.0204964 ]])

In [ ]:

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_6.ipynb?download=false 8/8

10/20/23, 3:34 PM Experiment_7

In [2]: # Python code to demonstrate the working of mean()

# importing statstics to handle statistical operations
import statistics
li = [1,2,3,3,2,2,21]

In [3]: print("The average of list values is:",end="")


The average of list values is:4.857142857142857

In [4]: from statistics import median

In [5]: from fractions import Fraction as fr

In [6]: data1 = (2,3,4,5,7,9,11)

In [7]: print("Median of data-set 1 is %s" %(median(data1)))

Median of data-set 1 is 5

In [8]: from statistics import mode

data1 = (2,3,3,4,5,5,5,5,6,6,6,7)

In [9]: print("Mode of data set 1 is %s"%(mode(data1)))

Mode of data set 1 is 5

In [15]: arr = [1,2,3,4,5]

Maximum = max(arr)

In [16]: Minimum = min(arr)

In [17]: Range = Maximum-Minimum

In [20]: print("Maximum = {}, Minumum = {} and Range = {}".format(Maximum,Minimum,Range))

Maximum = 5, Minumum = 1 and Range = 4

In [21]: from statistics import variance

sample1 = (1,2,5,4,8,9,12)
print("Variance of Sample1 is %s"%(variance(sample1)))

Variance of Sample1 is 15.80952380952381

In [22]: from statistics import stdev

sample1 = (1,2,5,4,8,9,12)
print("The Standard Deviation of Sample1 is %s" %(stdev(sample1)))

The Standard Deviation of Sample1 is 3.9761191895520196

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_7.ipynb?download=false 1/1

10/20/23, 4:28 PM Experiment_8

In [1]: #importing pandas as pd

import pandas as pd

In [2]: print(df.head())

Name Team Number Position Age Height Weight \

0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0

College Salary
0 Texas 7730337.0
1 Marquette 6796117.0
2 Boston University NaN
3 Georgia State 1148640.0
4 NaN 5000000.0

In [3]: df1=df.groupby('Team')

In [4]: df1.first()

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_8.ipynb?download=false 1/5

10/20/23, 4:28 PM Experiment_8

Out[4]: Name Number Position Age Height Weight College Salar


Kent Bazemore 24.0 SF 26.0 6-5 201.0 Old Dominion 2000000

Avery Bradley 0.0 PG 25.0 6-2 180.0 Texas 7730337

Brooklyn Bojan Oklahoma

44.0 SG 27.0 6-8 216.0 3425510
Nets Bogdanovic State

Charlotte Virginia
Nicolas Batum 5.0 SG 27.0 6-8 200.0 13125306
Hornets Commonwealth

Chicago Bulls 41.0 PF 25.0 6-9 250.0 New Mexico 845059

Cleveland Matthew
8.0 PG 25.0 6-4 198.0 Saint Mary's 1147276
Cavaliers Dellavedova

Justin Anderson 1.0 SG 22.0 6-6 228.0 Virginia 1449000

Darrell Arthur 0.0 PF 28.0 6-9 235.0 Kansas 2814000

Joel Anthony 50.0 C 33.0 6-9 245.0 UNLV 2500000

Golden State Leandro

19.0 SG 33.0 6-3 194.0 North Carolina 2500000
Warriors Barbosa

Trevor Ariza 1.0 SF 30.0 6-8 215.0 UCLA 8193030

Lavoy Allen 5.0 PF 27.0 6-9 255.0 Temple 4050000

Los Angeles
Cole Aldrich 45.0 C 27.0 6-11 250.0 Kansas 1100602

Los Angeles
Brandon Bass 2.0 PF 31.0 6-8 250.0 LSU 3000000

Jordan Adams 3.0 SG 21.0 6-5 209.0 UCLA 1404600

Miami Heat Chris Bosh 1.0 PF 32.0 6-11 235.0 Georgia Tech 22192730

Milwaukee Giannis
34.0 SF 21.0 6-11 222.0 Arizona 1953960
Bucks Antetokounmpo

Nemanja Bjelica 88.0 PF 28.0 6-10 240.0 Louisville 3950001

New Orleans
Alexis Ajinca 42.0 C 28.0 7-2 248.0 California 4389607

New York
Arron Afflalo 4.0 SG 30.0 6-5 210.0 UCLA 8000000

Steven Adams 12.0 C 22.0 7-0 255.0 Pittsburgh 2279040
City Thunder

Orlando Dewayne
3.0 C 26.0 7-0 245.0 USC 947276
Magic Dedmon

Philadelphia Elton Brand 42.0 PF 37.0 6-9 254.0 Duke 947276

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_8.ipynb?download=false 2/5

10/20/23, 4:28 PM Experiment_8

Name Number Position Age Height Weight College Salar


Phoenix Suns Eric Bledsoe 2.0 PG 26.0 6-1 190.0 Kentucky 13500000

Portland Trail
Cliff Alexander 34.0 PF 20.0 6-8 240.0 Kansas 525093

Quincy Acy 13.0 SF 25.0 6-7 240.0 Baylor 981348

San Antonio LaMarcus

12.0 PF 30.0 6-11 240.0 Texas 19689000
Spurs Aldridge

Toronto Bismack
8.0 C 23.0 6-9 245.0 Missouri 2814000
Raptors Biyombo

Utah Jazz Trevor Booker 33.0 PF 28.0 6-8 228.0 Clemson 4775000

Alan Anderson 6.0 SG 33.0 6-6 220.0 Michigan State 4000000

In [5]: df1.get_group('Boston Celtics')

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_8.ipynb?download=false 3/5

10/20/23, 4:29 PM Experiment_8

Out[5]: Name Team Number Position Age Height Weight College Salary

Avery Boston
0 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
Bradley Celtics

Jae Boston
1 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
Crowder Celtics

John Boston Boston

2 30.0 SG 27.0 6-5 205.0 NaN
Holland Celtics University

3 R.J. Hunter 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0

Jonas Boston
4 8.0 PF 29.0 6-10 231.0 NaN 5000000.0
Jerebko Celtics

Amir Boston
5 90.0 PF 29.0 6-9 240.0 NaN 12000000.0
Johnson Celtics

Jordan Boston
6 55.0 PF 21.0 6-8 235.0 LSU 1170960.0
Mickey Celtics

Kelly Boston
7 41.0 C 25.0 7-0 238.0 Gonzaga 2165160.0
Olynyk Celtics

Terry Boston
8 12.0 PG 22.0 6-2 190.0 Louisville 1824360.0
Rozier Celtics

Marcus Boston Oklahoma

9 36.0 PG 22.0 6-4 220.0 3431040.0
Smart Celtics State

Jared Boston
10 7.0 C 24.0 6-9 260.0 Ohio State 2569260.0
Sullinger Celtics

Isaiah Boston
11 4.0 PG 27.0 5-9 185.0 Washington 6912869.0
Thomas Celtics

Evan Boston
12 11.0 SG 27.0 6-7 220.0 Ohio State 3425510.0
Turner Celtics

James Boston
13 13.0 SG 20.0 6-6 215.0 Kentucky 1749840.0
Young Celtics

Boston North
14 Tyler Zeller 44.0 C 26.0 7-0 253.0 2616975.0
Celtics Carolina

In [6]: import pandas as pd

df = pd.read_csv("nba.csv")

In [7]: df2.first()

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_8.ipynb?download=false 4/5

10/20/23, 4:29 PM Experiment_8

Out[7]: Name Number Age Height Weight College Salary

Team Position

Atlanta C Al Horford 15.0 30.0 6-10 245.0 Florida 12000000.0

PF 43.0 31.0 6-9 235.0 Minnesota 1000000.0

PG 17.0 22.0 6-1 172.0 Wake Forest 1763400.0

Kent Old
SF 24.0 26.0 6-5 201.0 2000000.0
Bazemore Dominion

SG Hardaway 10.0 24.0 6-6 205.0 Michigan 1304520.0

... ... ... ... ... ... ... ... ...

Washington C North
Wizards 13.0 32.0 6-11 240.0 Carolina 11217391.0

PF 90.0 34.0 6-10 250.0 Kansas 3300000.0

PG 7.0 30.0 6-3 190.0 Nevada 2170465.0

Jared Boston
SF 1.0 30.0 6-7 225.0 4375000.0
Dudley College

Alan Michigan
SG 6.0 33.0 6-6 220.0 4000000.0
Anderson State

149 rows × 7 columns

localhost:8888/nbconvert/html/Experiments 6 to 10/Experiment_8.ipynb?download=false 5/5

10/20/23, 4:35 PM Experiment 9 - Jupyter Notebook

In [4]: import matplotlib.pyplot as plt

from scipy import stats
x = [5,7,8,7,2,17,2,9,4,11,12,9,6]
y = [99,86,87,88,111,86,103,87,94,78,77,85,86]
slope, intercept,r,p,std_err=stats.linregress(x,y)
def myfunc(x):
return slope *x+ intercept

localhost:8888/notebooks/Experiments 6 to 10/Experiment 9.ipynb 1/1

10/20/23, 4:54 PM Experiment 10 - Jupyter Notebook

In [14]: import pandas as pd

from sklearn import linear_model

# Read the CSV file into a DataFrame
df = pd.read_csv("data1.csv")

# Drop rows with non-numeric values in the 'Weight' and 'Volume' columns
df = df[pd.to_numeric(df['Weight'], errors='coerce').notna()]
df = df[pd.to_numeric(df['Volume'], errors='coerce').notna()]

# Convert 'Weight' and 'Volume' columns to numeric type
df['Weight'] = pd.to_numeric(df['Weight'])
df['Volume'] = pd.to_numeric(df['Volume'])

# Define the independent variables (features) X and the dependent variable (ta
X = df[['Weight', 'Volume']]
Y = df['CO2']

# Create a linear regression model
regr = linear_model.LinearRegression()

# Fit the linear regression model to the data, Y)

# Predict the CO2 emissions for a new data point with 'Weight' = 2300 and 'Vol
predictedCO2 = regr.predict([[2300, 1300]])

# Print the predicted CO2 emissions


C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\ UserWarning:
X does not have valid feature names, but LinearRegression was fitted with fea
ture names

In [ ]: ​

localhost:8888/notebooks/Experiments 6 to 10/Experiment 10.ipynb 1/1

You might also like