Professional Documents
Culture Documents
BDA-4 EDA Project
BDA-4 EDA Project
In [4]: data.head()
Out[4]: S.No. Name Location Year Kilometers_Driven Fuel_Type Transmission Owner_Type Mile
Maruti
0 0 Wagon R Mumbai 2010 72000 CNG Manual First
km
LXI CNG
Hyundai
Creta 1.6 1
1 1 Pune 2015 41000 Diesel Manual First
CRDi SX k
Option
Honda
2 2 Chennai 2011 46000 Petrol Manual First
Jazz V k
Maruti 2
3 3 Chennai 2012 87000 Diesel Manual First
Ertiga VDI k
Audi A4
New 2.0
4 4 Coimbatore 2013 40670 Diesel Automatic Second
TDI k
Multitronic
In [5]: data.tail()
Volkswagen
Vento
7248 7248 Hyderabad 2011 89411 Diesel Manual First
Diesel
Trendline
Volkswagen
7249 7249 Mumbai 2015 59000 Petrol Automatic First
Polo GT TSI
Nissan
7250 7250 Micra Kolkata 2012 28000 Diesel Manual First
Diesel XV
Volkswagen
7251 7251 Pune 2013 52262 Petrol Automatic Third
Polo GT TSI
Mercedes-
Benz E-
7252 7252 Class 2009- Kochi 2014 72443 Diesel Automatic First
2013 E 220
CDI Avan...
In [6]: data.dtypes
S.No. int64
Out[6]:
Name object
Location object
Year int64
Kilometers_Driven int64
Fuel_Type object
Transmission object
Owner_Type object
Mileage object
Engine object
Power object
Seats float64
New_Price object
Price float64
dtype: object
In [7]: data.describe().T
In [8]: data.info()
In [9]: data.nunique()
S.No. 7253
Out[9]:
Name 2041
Location 11
Year 23
Kilometers_Driven 3660
Fuel_Type 5
Transmission 2
Owner_Type 4
Mileage 450
Engine 150
Power 386
Seats 9
New_Price 625
Price 1373
dtype: int64
In [10]: data.isnull().sum()
S.No. 0
Out[10]:
Name 0
Location 0
Year 0
Kilometers_Driven 0
Fuel_Type 0
Transmission 0
Owner_Type 0
Mileage 2
Engine 46
Power 46
Seats 53
New_Price 6247
Price 1234
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 7253 non-null object
1 Location 7253 non-null object
2 Year 7253 non-null int64
3 Kilometers_Driven 7253 non-null int64
4 Fuel_Type 7253 non-null object
5 Transmission 7253 non-null object
6 Owner_Type 7253 non-null object
7 Mileage 7251 non-null object
8 Engine 7207 non-null object
9 Power 7207 non-null object
10 Seats 7200 non-null float64
11 New_Price 1006 non-null object
12 Price 6019 non-null float64
dtypes: float64(2), int64(2), object(9)
memory usage: 736.8+ KB
Maruti
26.6
0 Wagon R Mumbai 2010 72000 CNG Manual First 9
km/kg
LXI CNG
Hyundai
Creta 1.6 19.67
1 Pune 2015 41000 Diesel Manual First
CRDi SX kmpl
Option
Honda 18.2
2 Chennai 2011 46000 Petrol Manual First
Jazz V kmpl
Maruti 20.77
3 Chennai 2012 87000 Diesel Manual First
Ertiga VDI kmpl
Audi A4
New 2.0 15.2
4 Coimbatore 2013 40670 Diesel Automatic Second
TDI kmpl
Multitronic
In [14]: data.describe().T
In [16]: print(data.Brand.unique())
Land Rover
Range 12.7
13 Delhi 2014 72000 Diesel Automatic First
Rover 2.2L kmp
Pure
Land Rover
14 Freelander Pune 2012 85000 Diesel Automatic Second 0.0 kmp
2 TD4 SE
Mini
16.6
176 Countryman Jaipur 2017 8525 Diesel Automatic Second
kmp
Cooper D
Land Rover
Range 12.7
191 Coimbatore 2018 36091 Diesel Automatic First
Rover 2.2L kmp
Dynamic
Mini
Cooper 16.82
228 Kochi 2017 26327 Petrol Automatic First
Convertible kmp
S
In [19]: data.describe(include='all').T
Mahindra
Name 7253 2041 XUV500 55 NaN NaN NaN NaN N
W8 2WD
Year 7253.0 NaN NaN NaN 2013.365366 3.254421 1996.0 2011.0 201
Kilometers_Driven 7253.0 NaN NaN NaN 58699.063146 84427.720583 171.0 34000.0 5341
Mileage 7251 450 17.0 kmpl 207 NaN NaN NaN NaN N
63.71
New_Price 1006 625 6 NaN NaN NaN NaN N
Lakh
In [20]: cat_cols=data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=np.number).columns.tolist()
print("Categorical Variables:")
print(cat_cols)
print("Numerical Variables:")
print(num_cols)
Categorical Variables:
Index(['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type',
'Mileage', 'Engine', 'Power', 'New_Price', 'Brand', 'Model'],
dtype='object')
Numerical Variables:
['Year', 'Kilometers_Driven', 'Seats', 'Price', 'Car_Age']
Kilometers_Driven
Skew : 61.58
Seats
Skew : 1.9
Price
Skew : 3.34
Car_Age
Skew : 0.84
def add_percentage_labels(ax):
total = len(data)
for p in ax.patches:
percentage = '{:.1f}%'.format(100 * p.get_height() / total)
x = p.get_x() + p.get_width() / 2
y = p.get_height()
ax.annotate(percentage, (x, y), ha='center', va='bottom')
axes[1][1].tick_params(labelrotation=45)
axes[2][0].tick_params(labelrotation=90)
axes[2][1].tick_params(labelrotation=90)
plt.show()
plt.tight_layout()
plt.show()
In [27]: log_transform(data,['Kilometers_Driven','Price'])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 7253 non-null object
1 Location 7253 non-null object
2 Year 7253 non-null int64
3 Kilometers_Driven 7253 non-null int64
4 Fuel_Type 7253 non-null object
5 Transmission 7253 non-null object
6 Owner_Type 7253 non-null object
7 Mileage 7251 non-null object
8 Engine 7207 non-null object
9 Power 7207 non-null object
10 Seats 7200 non-null float64
11 New_Price 1006 non-null object
12 Price 6019 non-null float64
13 Car_Age 7253 non-null int64
14 Brand 7253 non-null object
15 Model 7252 non-null object
16 Kilometers_Driven_log 7253 non-null float64
17 Price_log 6019 non-null float64
dtypes: float64(4), int64(3), object(11)
memory usage: 1020.1+ KB
In [29]: plt.figure(figsize=(13,17))
sns.pairplot(data=data.drop(['Kilometers_Driven','Price'],axis=1))
plt.show()
In [32]: data.loc[data["Mileage"]==0.0,'Mileage']=np.nan
data.Mileage.isnull().sum()
2
Out[32]:
In [33]: data.Seats.isnull().sum()
data['Seats'].fillna(value=np.nan,inplace=True)
data['Seats']=data.groupby(['Model','Brand'])['Seats'].apply(lambda x:x.fillna(x.media
data['Engine']=data.groupby(['Brand','Model'])['Engine'].apply(lambda x:x.fillna(x.med
data['Power']=data.groupby(['Brand','Model'])['Power'].apply(lambda x:x.fillna(x.media
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\nanops.py in nanmedian(va
lues, axis, skipna, mask)
718 try:
--> 719 values = values.astype("f8")
720 except ValueError as err:
The above exception was the direct cause of the following exception:
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\groupby\groupby.py in _py
thon_apply_general(self, f, data)
1308 """
-> 1309 keys, values, mutated = self.grouper.apply(f, data, self.axis)
1310
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\groupby\ops.py in apply(s
elf, f, data, axis)
840 group_axes = group.axes
--> 841 res = f(group)
842 if not _is_indexed_like(res, group_axes, axis):
~\AppData\Local\Temp\ipykernel_5292\2680079267.py in <lambda>(x)
3 data['Seats']=data.groupby(['Model','Brand'])['Seats'].apply(lambda x:x.filln
a(x.median()))
----> 4 data['Engine']=data.groupby(['Brand','Model'])['Engine'].apply(lambda x:x.fil
lna(x.median()))
5 data['Power']=data.groupby(['Brand','Model'])['Power'].apply(lambda x:x.filln
a(x.median()))
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in median(sel
f, axis, skipna, level, numeric_only, **kwargs)
10801 ):
> 10802 return NDFrame.median(self, axis, skipna, level, numeric_only, **
kwargs)
10803
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in median(sel
f, axis, skipna, level, numeric_only, **kwargs)
10373 def median(self, axis=None, skipna=None, level=None, numeric_only=None, *
*kwargs):
> 10374 return self._stat_function(
10375 "median", nanops.nanmedian, axis, skipna, level, numeric_only, **
kwargs
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in _stat_funct
ion(self, name, func, axis, skipna, level, numeric_only, **kwargs)
10353 )
> 10354 return self._reduce(
10355 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_o
nly
localhost:8888/nbconvert/html/~BDA-4 EDA Project.ipynb?download=false 16/19
4/15/24, 4:57 PM ~BDA-4 EDA Project
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\series.py in _reduce(sel
f, op, name, axis, skipna, numeric_only, filter_type, **kwds)
4391 with np.errstate(all="ignore"):
-> 4392 return op(delegate, skipna=skipna, **kwds)
4393
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\nanops.py in f(values, ax
is, skipna, **kwds)
154 else:
--> 155 result = alt(values, axis=axis, skipna=skipna, **kwds)
156
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\nanops.py in nanmedian(va
lues, axis, skipna, mask)
721 # e.g. "could not convert string to float: 'a'"
--> 722 raise TypeError(str(err)) from err
723 if mask is not None:
The above exception was the direct cause of the following exception:
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\groupby\generic.py in app
ly(self, func, *args, **kwargs)
221 )
222 def apply(self, func, *args, **kwargs):
--> 223 return super().apply(func, *args, **kwargs)
224
225 @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\groupby\groupby.py in app
ly(self, func, *args, **kwargs)
1284
1285 with group_selection_context(self):
-> 1286 return self._python_apply_general(f, self._selected_obj)
1287
1288 return result
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\groupby\ops.py in apply(s
elf, f, data, axis)
839 # group might be modified
840 group_axes = group.axes
--> 841 res = f(group)
842 if not _is_indexed_like(res, group_axes, axis):
843 mutated = True
~\AppData\Local\Temp\ipykernel_5292\2680079267.py in <lambda>(x)
2 data['Seats'].fillna(value=np.nan,inplace=True)
3 data['Seats']=data.groupby(['Model','Brand'])['Seats'].apply(lambda x:x.filln
a(x.median()))
----> 4 data['Engine']=data.groupby(['Brand','Model'])['Engine'].apply(lambda x:x.fil
lna(x.median()))
5 data['Power']=data.groupby(['Brand','Model'])['Power'].apply(lambda x:x.filln
a(x.median()))
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in median(sel
f, axis, skipna, level, numeric_only, **kwargs)
10800 self, axis=None, skipna=None, level=None, numeric_only=None, **kw
args
10801 ):
> 10802 return NDFrame.median(self, axis, skipna, level, numeric_only, **
kwargs)
10803
10804 setattr(cls, "median", median)
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in median(sel
f, axis, skipna, level, numeric_only, **kwargs)
10372
10373 def median(self, axis=None, skipna=None, level=None, numeric_only=None, *
*kwargs):
> 10374 return self._stat_function(
10375 "median", nanops.nanmedian, axis, skipna, level, numeric_only, **
kwargs
10376 )
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\generic.py in _stat_funct
ion(self, name, func, axis, skipna, level, numeric_only, **kwargs)
10352 name, axis=axis, level=level, skipna=skipna, numeric_only=num
eric_only
10353 )
> 10354 return self._reduce(
10355 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_o
nly
10356 )
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\series.py in _reduce(sel
f, op, name, axis, skipna, numeric_only, filter_type, **kwds)
4390 )
4391 with np.errstate(all="ignore"):
-> 4392 return op(delegate, skipna=skipna, **kwds)
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\nanops.py in f(values, ax
is, skipna, **kwds)
153 result = alt(values, axis=axis, skipna=skipna, **kwds)
154 else:
--> 155 result = alt(values, axis=axis, skipna=skipna, **kwds)
156
157 return result
~\AppData\Roaming\Python\Python39\site-packages\pandas\core\nanops.py in nanmedian(va
lues, axis, skipna, mask)
720 except ValueError as err:
721 # e.g. "could not convert string to float: 'a'"
--> 722 raise TypeError(str(err)) from err
723 if mask is not None:
724 values[mask] = np.nan
In [ ]: