You are on page 1 of 15

30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [1]: import pandas as pd


import os
from getpass import getpass

import pymysql as sql

# Connection settings are taken from the environment so that no credential is
# stored in the notebook. Host and user keep the notebook's original values as
# defaults; the password falls back to an interactive prompt.
# NOTE(review): the original connect() call is cut off after the password
# argument in this export, so the database name (and any further arguments)
# could not be recovered -- it is read from EMP_DB_NAME here; confirm.
db = sql.connect(
    host=os.environ.get('EMP_DB_HOST', 'localhost'),
    user=os.environ.get('EMP_DB_USER', 'root'),
    password=os.environ.get('EMP_DB_PASSWORD') or getpass('MySQL password: '),
    database=os.environ['EMP_DB_NAME'],
)

try:
    # read_sql_query already returns a DataFrame, so there is no need to
    # build one by hand.
    data = pd.read_sql_query('select * from emp', db)
finally:
    # Release the connection as soon as the table has been loaded.
    db.close()
data

C:\Users\Acer\AppData\Local\Temp\ipykernel_21408\2883241542.py:5: UserWarn
ing: pandas only supports SQLAlchemy connectable (engine/connection) or da
tabase string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are n
ot tested. Please consider using SQLAlchemy.
data=pd.read_sql_query('select * from emp',db) #isse dataframe
bnane ki jarurt nhi h by default bn jati hai

Out[1]: id name lastname age city salary

0 111 Rohit Verma 27 Meerut 2000

1 112 Monu Kasana 23 Ghaziabad 5000

2 113 Vinod Sharma 28 Noida 12000

3 114 Satish Bhati 25 Bulandsher 4000

4 115 Manish Dhama 23 Greater Noida 10000

5 116 Sachin Dedha 24 Mujaffarnagar 9000

6 117 Manoj Tyagi 22 New Delhi 14000

In [2]: # DATASET CLEANING ONLY APPLIES WHERE THERE ARE [NaN] VALUES ----------


# In the dataset above there are no null values.

In [3]: data.isnull().sum() # this counts the null values in each column

Out[3]: id 0
name 0
lastname 0
age 0
city 0
salary 0
dtype: int64

In [4]: # example--------
import numpy as np

# Small sample with two missing readings (np.nan) mixed in with the numbers.
readings = [5, 6, 7, 8, np.nan, 44, 55, np.nan]
s = np.array(readings)
s

Out[4]: array([ 5., 6., 7., 8., nan, 44., 55., nan])

In [5]: np.mean(s) # no calculation is possible here because the array contains NaN values (np.nanmean(s) would skip them)

Out[5]: nan

localhost:8888/notebooks/CLEANING DATA SET.ipynb 1/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [6]: # ANOTHER EXAMPLE OF CLEANING A DATA SET ------->> FIRST WE EXTRACT THE DATA

In [7]: import pandas as pd


# Load the workbook into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists.
workbook_path = "C:/Users/Acer/Desktop/DATA12.xlsx"
data = pd.read_excel(workbook_path)
data

Out[7]: ID Name Industry Inception Revenue Expenses Profit Growth Sa

IT 6,482,465
0 1.0 Lamtone 2009.0 $11,757,018 5274553.0 0.30 N
Services Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN N
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN N

IT 7,429,377
3 4.0 NaN 2013.0 NaN 6597557.0 NaN N
Services Dollars

7,435,363
4 5.0 NaN NaN NaN NaN 3138627.0 NaN N
Dollars

5,470,303
5 6.0 Techline Health 2006.0 NaN 8427816.0 0.23 N
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06 N

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04 N
Dollars

IT
8 9.0 Ganzlax 2011.0 $14,001,180 NaN 11901180.0 0.18 N
Services

9 NaN NaN NaN NaN NaN NaN NaN NaN N

In [8]: data.isnull().sum() # this counts the null values in each column

Out[8]: ID 1
Name 3
Industry 4
Inception 2
Revenue 4
Expenses 4
Profit 1
Growth 5
Salary 10
dtype: int64

In [9]: data.count(axis=1) # this counts the non-null values in each row

Out[9]: 0 8
1 7
2 6
3 5
4 3
5 7
6 6
7 7
8 7
9 0
dtype: int64

localhost:8888/notebooks/CLEANING DATA SET.ipynb 2/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [10]: data.count(axis=0) # this counts the non-null values in each column

Out[10]: ID 9
Name 7
Industry 6
Inception 8
Revenue 6
Expenses 6
Profit 9
Growth 5
Salary 0
dtype: int64

In [11]: data

Out[11]: ID Name Industry Inception Revenue Expenses Profit Growth Sa

IT 6,482,465
0 1.0 Lamtone 2009.0 $11,757,018 5274553.0 0.30 N
Services Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN N
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN N

IT 7,429,377
3 4.0 NaN 2013.0 NaN 6597557.0 NaN N
Services Dollars

7,435,363
4 5.0 NaN NaN NaN NaN 3138627.0 NaN N
Dollars

5,470,303
5 6.0 Techline Health 2006.0 NaN 8427816.0 0.23 N
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06 N

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04 N
Dollars

IT
8 9.0 Ganzlax 2011.0 $14,001,180 NaN 11901180.0 0.18 N
Services

9 NaN NaN NaN NaN NaN NaN NaN NaN N

In [12]: # HERE ALL THE VALUES OF THE SALARY COLUMN ARE NaN, SO WE DROP THE WHOLE COLUMN
# Permanently remove the Salary column (inplace=True mutates `data` itself).
data.drop(columns=['Salary'], inplace=True)

localhost:8888/notebooks/CLEANING DATA SET.ipynb 3/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [13]: data

Out[13]: ID Name Industry Inception Revenue Expenses Profit Growth

IT 6,482,465
0 1.0 Lamtone 2009.0 $11,757,018 5274553.0 0.30
Services Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN

IT 7,429,377
3 4.0 NaN 2013.0 NaN 6597557.0 NaN
Services Dollars

7,435,363
4 5.0 NaN NaN NaN NaN 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 NaN 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04
Dollars

IT
8 9.0 Ganzlax 2011.0 $14,001,180 NaN 11901180.0 0.18
Services

9 NaN NaN NaN NaN NaN NaN NaN NaN

In [14]: # NOW WE CHECK THE ROWS -------->>>


# Number of non-null values in each row.
data.count(axis='columns')

Out[14]: 0 8
1 7
2 6
3 5
4 3
5 7
6 6
7 7
8 7
9 0
dtype: int64

In [15]: # the row with index 9 contains only NaN values, so now we remove that whole row
# Drop, in place, every row whose cells are all NaN.
data.dropna(axis=0, how='all', inplace=True)

localhost:8888/notebooks/CLEANING DATA SET.ipynb 4/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [16]: data

Out[16]: ID Name Industry Inception Revenue Expenses Profit Growth

6,482,465
0 1.0 Lamtone IT Services 2009.0 $11,757,018 5274553.0 0.30
Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN

7,429,377
3 4.0 NaN IT Services 2013.0 NaN 6597557.0 NaN
Dollars

7,435,363
4 5.0 NaN NaN NaN NaN 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 NaN 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04
Dollars

8 9.0 Ganzlax IT Services 2011.0 $14,001,180 NaN 11901180.0 0.18

In [17]: data.dropna(how='any') # this drops every row that contains even one NaN value (the result is only displayed; `data` itself is unchanged)

Out[17]: ID Name Industry Inception Revenue Expenses Profit Growth

0 1.0 Lamtone IT Services 2009.0 $11,757,018 6,482,465 Dollars 5274553.0 0.3

In [18]: data

Out[18]: ID Name Industry Inception Revenue Expenses Profit Growth

6,482,465
0 1.0 Lamtone IT Services 2009.0 $11,757,018 5274553.0 0.30
Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN

7,429,377
3 4.0 NaN IT Services 2013.0 NaN 6597557.0 NaN
Dollars

7,435,363
4 5.0 NaN NaN NaN NaN 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 NaN 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04
Dollars

8 9.0 Ganzlax IT Services 2011.0 $14,001,180 NaN 11901180.0 0.18

localhost:8888/notebooks/CLEANING DATA SET.ipynb 5/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [19]: # NOW WE WORK ON THE REVENUE & EXPENSES COLUMNS (cleaning the NaN values from them)


#STEP 1 =>> fill zero where NAN values in the column

# Missing Revenue cells become the string '0'; only the Revenue column is
# targeted by the mapping.
revenue_fill = {'Revenue': '0'}
data.fillna(value=revenue_fill, inplace=True)
data

Out[19]: ID Name Industry Inception Revenue Expenses Profit Growth

6,482,465
0 1.0 Lamtone IT Services 2009.0 $11,757,018 5274553.0 0.30
Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 $12,329,371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 $10,597,009 NaN 3005820.0 NaN

7,429,377
3 4.0 NaN IT Services 2013.0 0 6597557.0 NaN
Dollars

7,435,363
4 5.0 NaN NaN NaN 0 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 0 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 $9,254,614 NaN 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 $9,451,943 5573830.0 0.04
Dollars

8 9.0 Ganzlax IT Services 2011.0 $14,001,180 NaN 11901180.0 0.18

In [20]: # STEP 2 =>> make list



# Pull the Revenue column out as a plain Python list and display it.
n = data['Revenue']
n2 = [value for value in n]
n2

Out[20]: ['$11,757,018 ',


'$12,329,371 ',
'$10,597,009 ',
'0',
'0',
'0',
'$9,254,614 ',
'$9,451,943 ',
'$14,001,180 ']

In [21]: # STEP 3 =>> remove extra characters such as the dollar sign ($) and comma (,) from the list
# Remove every '$' and ',' from each entry and trim surrounding whitespace,
# so that e.g. '$11,757,018 ' becomes '11757018'. The entries stay strings.
u = [raw.replace("$", "").replace(",", "").strip() for raw in n2]

localhost:8888/notebooks/CLEANING DATA SET.ipynb 6/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [22]: u

Out[22]: ['11757018 ',


'12329371 ',
'10597009 ',
'0',
'0',
'0',
'9254614 ',
'9451943 ',
'14001180 ']

In [23]: ​
# STEP 4 =>> All the cleaned values are now in the variable `u`; move them back into the Revenue column

# Overwrite the Revenue column with the cleaned strings, then display the frame.
data['Revenue']=u
data

Out[23]: ID Name Industry Inception Revenue Expenses Profit Growth

6,482,465
0 1.0 Lamtone IT Services 2009.0 11757018 5274553.0 0.30
Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 12329371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 10597009 NaN 3005820.0 NaN

7,429,377
3 4.0 NaN IT Services 2013.0 0 6597557.0 NaN
Dollars

7,435,363
4 5.0 NaN NaN NaN 0 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 0 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 9254614 NaN 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 9451943 5573830.0 0.04
Dollars

8 9.0 Ganzlax IT Services 2011.0 14001180 NaN 11901180.0 0.18

In [24]: data.info() # this shows that Revenue is still of object dtype, i.e. it holds string values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 9 non-null float64
1 Name 7 non-null object
2 Industry 6 non-null object
3 Inception 8 non-null float64
4 Revenue 9 non-null object
5 Expenses 6 non-null object
6 Profit 9 non-null float64
7 Growth 5 non-null float64
dtypes: float64(4), object(4)
memory usage: 648.0+ bytes

localhost:8888/notebooks/CLEANING DATA SET.ipynb 7/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [25]: data['Revenue']=data['Revenue'].astype(int)

In [26]: data.info() # now Revenue has been changed into integer value------>>>>

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 9 non-null float64
1 Name 7 non-null object
2 Industry 6 non-null object
3 Inception 8 non-null float64
4 Revenue 9 non-null int32
5 Expenses 6 non-null object
6 Profit 9 non-null float64
7 Growth 5 non-null float64
dtypes: float64(4), int32(1), object(3)
memory usage: 612.0+ bytes

In [27]: # NOW WE PERFORM THE SAME STEPS FOR THE EXPENSE COLUMN----->>>>>>>>>

#STEP 1 =>> fill zero where NAN values in the column

# Missing Expenses cells become the string '0'; only the Expenses column is
# targeted by the mapping.
expenses_fill = {'Expenses': '0'}
data.fillna(value=expenses_fill, inplace=True)
data

Out[27]: ID Name Industry Inception Revenue Expenses Profit Growth

6,482,465
0 1.0 Lamtone IT Services 2009.0 11757018 5274553.0 0.30
Dollars

Financial 916,455
1 2.0 Stripfind 2010.0 12329371 11412916.0 NaN
Services Dollars

2 3.0 Canecorporation Health 2012.0 10597009 0 3005820.0 NaN

7,429,377
3 4.0 NaN IT Services 2013.0 0 6597557.0 NaN
Dollars

7,435,363
4 5.0 NaN NaN NaN 0 3138627.0 NaN
Dollars

5,470,303
5 6.0 Techline Health 2006.0 0 8427816.0 0.23
Dollars

6 7.0 Cityace NaN 2010.0 9254614 0 3005116.0 0.06

3,878,113
7 8.0 Kayelectronics NaN 2009.0 9451943 5573830.0 0.04
Dollars

8 9.0 Ganzlax IT Services 2011.0 14001180 0 11901180.0 0.18

localhost:8888/notebooks/CLEANING DATA SET.ipynb 8/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [28]: # STEP 2 =>> make list



# Pull the Expenses column out as a plain Python list and display it.
l = data['Expenses']
l2 = [value for value in l]
l2

Out[28]: ['6,482,465 Dollars',


'916,455 Dollars',
'0',
'7,429,377 Dollars',
'7,435,363 Dollars',
'5,470,303 Dollars',
'0',
'3,878,113 Dollars',
'0']

In [38]: # STEP 3=>> remove extra things like [Dollars] and comma(,) from the list-
# Keep only the digit characters of each entry, which discards the commas and
# the ' Dollars' suffix. Any other non-digit character (a decimal point, a
# minus sign) would be discarded as well.
m = ["".join(ch for ch in entry if ch.isdigit()) for entry in l2]

In [39]: m

Out[39]: ['6482465',
'916455',
'0',
'7429377',
'7435363',
'5470303',
'0',
'3878113',
'0']

In [40]: data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 9 non-null float64
1 Name 7 non-null object
2 Industry 6 non-null object
3 Inception 8 non-null float64
4 Revenue 9 non-null int32
5 Expenses 9 non-null object
6 Profit 9 non-null float64
7 Growth 5 non-null float64
dtypes: float64(4), int32(1), object(3)
memory usage: 612.0+ bytes

localhost:8888/notebooks/CLEANING DATA SET.ipynb 9/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [41]: data['Expenses']=m
data

Out[41]: ID Name Industry Inception Revenue Expenses Profit Growth

0 1.0 Lamtone IT Services 2009.0 11757018 6482465 5274553.0 0.30

Financial
1 2.0 Stripfind 2010.0 12329371 916455 11412916.0 NaN
Services

2 3.0 Canecorporation Health 2012.0 10597009 0 3005820.0 NaN

3 4.0 NaN IT Services 2013.0 0 7429377 6597557.0 NaN

4 5.0 NaN NaN NaN 0 7435363 3138627.0 NaN

5 6.0 Techline Health 2006.0 0 5470303 8427816.0 0.23

6 7.0 Cityace NaN 2010.0 9254614 0 3005116.0 0.06

7 8.0 Kayelectronics NaN 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 Ganzlax IT Services 2011.0 14001180 0 11901180.0 0.18

In [42]: data['Expenses']=data['Expenses'].astype(int)

In [43]: data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 9 non-null float64
1 Name 7 non-null object
2 Industry 6 non-null object
3 Inception 8 non-null float64
4 Revenue 9 non-null int32
5 Expenses 9 non-null int32
6 Profit 9 non-null float64
7 Growth 5 non-null float64
dtypes: float64(4), int32(2), object(2)
memory usage: 576.0+ bytes

In [44]: gf=data.select_dtypes(['int','float']) # this will provide the integer and float columns


gf

Out[44]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.30

1 2.0 2010.0 12329371 916455 11412916.0 NaN

2 3.0 2012.0 10597009 0 3005820.0 NaN

3 4.0 2013.0 0 7429377 6597557.0 NaN

4 5.0 NaN 0 7435363 3138627.0 NaN

5 6.0 2006.0 0 5470303 8427816.0 0.23

6 7.0 2010.0 9254614 0 3005116.0 0.06

7 8.0 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 2011.0 14001180 0 11901180.0 0.18

localhost:8888/notebooks/CLEANING DATA SET.ipynb 10/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [46]: gf.isnull().sum()/len(gf)*100 # this gives the percentage of NaN values in each column

Out[46]: ID 0.000000
Inception 11.111111
Revenue 0.000000
Expenses 0.000000
Profit 0.000000
Growth 44.444444
dtype: float64

In [47]: gf

Out[47]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.30

1 2.0 2010.0 12329371 916455 11412916.0 NaN

2 3.0 2012.0 10597009 0 3005820.0 NaN

3 4.0 2013.0 0 7429377 6597557.0 NaN

4 5.0 NaN 0 7435363 3138627.0 NaN

5 6.0 2006.0 0 5470303 8427816.0 0.23

6 7.0 2010.0 9254614 0 3005116.0 0.06

7 8.0 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 2011.0 14001180 0 11901180.0 0.18

In [48]: # CHECK FOR OUTLIERS IN THE GROWTH COLUMN------


# Pull out the Growth column as a Series and display it.
t=gf['Growth']
t

Out[48]: 0 0.30
1 NaN
2 NaN
3 NaN
4 NaN
5 0.23
6 0.06
7 0.04
8 0.18
Name: Growth, dtype: float64

In [ ]: ​

In [ ]: ​

localhost:8888/notebooks/CLEANING DATA SET.ipynb 11/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [49]: t.plot.box()

Out[49]: <Axes: >

localhost:8888/notebooks/CLEANING DATA SET.ipynb 12/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [52]: # CHECK FOR OUTLIERS IN THE INCEPTION COLUMN----


# Box plot of the Inception column. The bare `t2` expression that used to sit
# between these two lines was never displayed (only a cell's last expression
# is), so it has been removed.
t2 = gf['Inception']
t2.plot.box()

Out[52]: <Axes: >

In [50]: # FILL THE MEAN VALUE IN PLACE OF THE NaN VALUES------------>>


# Show a copy of gf with each NaN replaced by its column's mean; the result is
# only displayed, gf itself is not reassigned.
column_means = gf.mean()
gf.fillna(value=column_means)

Out[50]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.300

1 2.0 2010.0 12329371 916455 11412916.0 0.162

2 3.0 2012.0 10597009 0 3005820.0 0.162

3 4.0 2013.0 0 7429377 6597557.0 0.162

4 5.0 2010.0 0 7435363 3138627.0 0.162

5 6.0 2006.0 0 5470303 8427816.0 0.230

6 7.0 2010.0 9254614 0 3005116.0 0.060

7 8.0 2009.0 9451943 3878113 5573830.0 0.040

8 9.0 2011.0 14001180 0 11901180.0 0.180

In [57]: gf['Growth']=gf['Growth'].fillna(gf['Growth'].median()) # fill the median value in place of the NaN values

localhost:8888/notebooks/CLEANING DATA SET.ipynb 13/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [58]: gf #(here median is 0.18)

Out[58]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.30

1 2.0 2010.0 12329371 916455 11412916.0 0.18

2 3.0 2012.0 10597009 0 3005820.0 0.18

3 4.0 2013.0 0 7429377 6597557.0 0.18

4 5.0 NaN 0 7435363 3138627.0 0.18

5 6.0 2006.0 0 5470303 8427816.0 0.23

6 7.0 2010.0 9254614 0 3005116.0 0.06

7 8.0 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 2011.0 14001180 0 11901180.0 0.18

In [59]: data

Out[59]: ID Name Industry Inception Revenue Expenses Profit Growth

0 1.0 Lamtone IT Services 2009.0 11757018 6482465 5274553.0 0.30

Financial
1 2.0 Stripfind 2010.0 12329371 916455 11412916.0 NaN
Services

2 3.0 Canecorporation Health 2012.0 10597009 0 3005820.0 NaN

3 4.0 NaN IT Services 2013.0 0 7429377 6597557.0 NaN

4 5.0 NaN NaN NaN 0 7435363 3138627.0 NaN

5 6.0 Techline Health 2006.0 0 5470303 8427816.0 0.23

6 7.0 Cityace NaN 2010.0 9254614 0 3005116.0 0.06

7 8.0 Kayelectronics NaN 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 Ganzlax IT Services 2011.0 14001180 0 11901180.0 0.18

In [60]: gf=data.select_dtypes(['int','float']) # this will provide the integer and float columns


gf

Out[60]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.30

1 2.0 2010.0 12329371 916455 11412916.0 NaN

2 3.0 2012.0 10597009 0 3005820.0 NaN

3 4.0 2013.0 0 7429377 6597557.0 NaN

4 5.0 NaN 0 7435363 3138627.0 NaN

5 6.0 2006.0 0 5470303 8427816.0 0.23

6 7.0 2010.0 9254614 0 3005116.0 0.06

7 8.0 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 2011.0 14001180 0 11901180.0 0.18

localhost:8888/notebooks/CLEANING DATA SET.ipynb 14/15


30/10/2023, 10:31 CLEANING DATA SET - Jupyter Notebook

In [62]: gf.fillna(0) # fill zero in place of NaN values


Out[62]: ID Inception Revenue Expenses Profit Growth

0 1.0 2009.0 11757018 6482465 5274553.0 0.30

1 2.0 2010.0 12329371 916455 11412916.0 0.00

2 3.0 2012.0 10597009 0 3005820.0 0.00

3 4.0 2013.0 0 7429377 6597557.0 0.00

4 5.0 0.0 0 7435363 3138627.0 0.00

5 6.0 2006.0 0 5470303 8427816.0 0.23

6 7.0 2010.0 9254614 0 3005116.0 0.06

7 8.0 2009.0 9451943 3878113 5573830.0 0.04

8 9.0 2011.0 14001180 0 11901180.0 0.18

In [ ]: # FILL THE LINEAR VALUES INTO THE DATA--------->>>>>>>>>

localhost:8888/notebooks/CLEANING DATA SET.ipynb 15/15

You might also like