You are on page 1of 17

11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [6]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import scipy.stats as stats
import seaborn as sns
import os
os.getcwd()
os.chdir('C:\\Users\\USER\\Downloads')

In [8]:

df_Wholesale = pd.read_csv('Wholesale+Customers+Data.csv')

In [9]:

df.head()

Out[9]:

Buyer/Spender Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicates

0 1 Retail Other 12669 9656 7561 214 2674 1

1 2 Retail Other 7057 9810 9568 1762 3293 1

2 3 Retail Other 6353 8808 7684 2405 3516 7

3 4 Hotel Other 13265 1196 4221 6404 507 1

4 5 Retail Other 22615 5410 7198 3915 1777 5

In [10]:

df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 440 entries, 0 to 439

Data columns (total 9 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 Buyer/Spender 440 non-null int64

1 Channel 440 non-null object

2 Region 440 non-null object

3 Fresh 440 non-null int64

4 Milk 440 non-null int64

5 Grocery 440 non-null int64

6 Frozen 440 non-null int64

7 Detergents_Paper 440 non-null int64

8 Delicatessen 440 non-null int64

dtypes: int64(7), object(2)

memory usage: 31.1+ KB

In [11]:

# There are 440 entries with 9 columns however there is no null values.
# There are 7 integers and 2 objects

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 1/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [12]:

df.describe(include='all')

Out[12]:

Buyer/Spender Channel Region Fresh Milk Grocery Fro

count 440.000000 440 440 440.000000 440.000000 440.000000 440.000

unique NaN 2 3 NaN NaN NaN N

top NaN Hotel Other NaN NaN NaN N

freq NaN 298 316 NaN NaN NaN N

mean 220.500000 NaN NaN 12000.297727 5796.265909 7951.277273 3071.931

std 127.161315 NaN NaN 12647.328865 7380.377175 9503.162829 4854.673

min 1.000000 NaN NaN 3.000000 55.000000 3.000000 25.000

25% 110.750000 NaN NaN 3127.750000 1533.000000 2153.000000 742.250

50% 220.500000 NaN NaN 8504.000000 3627.000000 4755.500000 1526.000

75% 330.250000 NaN NaN 16933.750000 7190.250000 10655.750000 3554.250

max 440.000000 NaN NaN 112151.000000 73498.000000 92780.000000 60869.000

In [13]:

# Hotel as channel and other as region has spent the most on above data and retail spend th

In [14]:

df2 = df.copy()
df2['Total'] = df2['Fresh'] + df2['Milk'] + df2['Grocery'] + df2['Frozen'] + df2['Detergent

In [15]:

#Dropping "Buyer/Spender" column as it is not useful


df2 = df2.drop(['Buyer/Spender'], axis = 1)

In [16]:

df2.head()

Out[16]:

Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen Total

0 Retail Other 12669 9656 7561 214 2674 1338 34112

1 Retail Other 7057 9810 9568 1762 3293 1776 33266

2 Retail Other 6353 8808 7684 2405 3516 7844 36610

3 Hotel Other 13265 1196 4221 6404 507 1788 27381

4 Retail Other 22615 5410 7198 3915 1777 5185 46100

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 2/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [17]:

df2.describe(include='all')

Out[17]:

Channel Region Fresh Milk Grocery Frozen Detergents_

count 440 440 440.000000 440.000000 440.000000 440.000000 440.0

unique 2 3 NaN NaN NaN NaN

top Hotel Other NaN NaN NaN NaN

freq 298 316 NaN NaN NaN NaN

mean NaN NaN 12000.297727 5796.265909 7951.277273 3071.931818 2881.4

std NaN NaN 12647.328865 7380.377175 9503.162829 4854.673333 4767.8

min NaN NaN 3.000000 55.000000 3.000000 25.000000 3.0

25% NaN NaN 3127.750000 1533.000000 2153.000000 742.250000 256.7

50% NaN NaN 8504.000000 3627.000000 4755.500000 1526.000000 816.5

75% NaN NaN 16933.750000 7190.250000 10655.750000 3554.250000 3922.0

max NaN NaN 112151.000000 73498.000000 92780.000000 60869.000000 40827.0

In [20]:

df2.groupby('Region')['Total'].sum()

Out[20]:

Region

Lisbon 2386813

Oporto 1555088

Other 10677599

Name: Total, dtype: int64

In [24]:

plt.figure(figsize = (6,4))
df2.groupby('Region')['Total'].sum().plot.bar();

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 3/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [26]:

df2.groupby('Channel')['Total'].sum()

Out[26]:

Channel

Hotel 7999569

Retail 6619931

Name: Total, dtype: int64

In [27]:

df2.groupby('Channel')['Total'].sum().plot.bar();

In [28]:

pd.pivot_table(df2, values=['Total'], index=['Region'], aggfunc=np.sum)

Out[28]:

Total

Region

Lisbon 2386813

Oporto 1555088

Other 10677599

In [29]:

pd.pivot_table(df2, values=['Total'], index=['Channel'], aggfunc=np.sum)

Out[29]:

Total

Channel

Hotel 7999569

Retail 6619931

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 4/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [30]:

# Hotel has spend the most and Retails spend the least

In [31]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

Out[31]:

Delicatessen Detergents_Paper Fresh Frozen Grocery Milk Total

Region

Lisbon 104327 204136 854833 231026 570037 422454 2386813

Oporto 54506 173311 464721 190132 433274 239144 1555088

Other 512110 890410 3960577 930492 2495251 1888759 10677599

In [32]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

In [33]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

Out[33]:

Delicatessen Detergents_Paper Fresh Frozen Grocery Milk Total

Channel

Hotel 421955 235587 4015717 1116979 1180717 1028614 7999569

Retail 248988 1032270 1264414 234671 2317845 1521743 6619931

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 5/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [34]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

In [35]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

Out[35]:

Delicatessen Detergents_Paper Fresh Frozen Grocery Milk Total

Region Channel

Hotel 70632 56081 761233 184512 237542 228342 1538342


Lisbon
Retail 33695 148055 93600 46514 332495 194112 848471

Hotel 30965 13516 326215 160861 123074 64519 719150


Oporto
Retail 23541 159795 138506 29271 310200 174625 835938

Hotel 320358 165990 2928269 771606 820101 735753 5742077


Other
Retail 191752 724420 1032308 158886 1675150 1153006 4935522

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 6/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [36]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

In [37]:

pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates

Out[37]:

Delicatessen Detergents_Paper Fresh Frozen Groce

Channel Hotel Retail Hotel Retail Hotel Retail Hotel Retail Hotel Ret

Region

Lisbon 70632 33695 56081 148055 761233 93600 184512 46514 237542 3324

Oporto 30965 23541 13516 159795 326215 138506 160861 29271 123074 3102

Other 320358 191752 165990 724420 2928269 1032308 771606 158886 820101 16751

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 7/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [38]:

pd.pivot_table(df2, values='Fresh', index=['Region'], columns=['Channel'], aggfunc=np.sum)

Out[38]:

Channel Hotel Retail

Region

Lisbon 761233 93600

Oporto 326215 138506

Other 2928269 1032308

In [39]:

#out of all different varities Fresh has the highest values and delicatessen has the lowest
# other has maximum number of channel and regions however Oporto has a least number of Chan

In [40]:

df.std()/df.mean()

Out[40]:

Buyer/Spender 0.576695

Fresh 1.053918

Milk 1.273299

Grocery 1.195174

Frozen 1.580332

Detergents_Paper 1.654647

Delicatessen 1.849407

dtype: float64

In [41]:

df.describe().T['std'] / df.describe().T['mean']

Out[41]:

Buyer/Spender 0.576695

Fresh 1.053918

Milk 1.273299

Grocery 1.195174

Frozen 1.580332

Detergents_Paper 1.654647

Delicatessen 1.849407

dtype: float64

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 8/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [42]:

sns.boxplot(data=df);

In [43]:

#Outliers are important because of this we need to understand our data more accurately

In [49]:

#Analysis Report
#On the basis of analysis we should concentrate more on Hotel and other region because we h
#Fresh items we are spending more than other however delicatession has lowest spend
#we have obeserved the retail has less spend and they are spending more on Grocessary

In [50]:

#Problem 2 Clear Mountain State University

In [51]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats

In [52]:

df = pd.read_csv('Survey-1.csv')

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 9/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [53]:

df.head()

Out[53]:

Grad Social
ID Gender Age Class Major GPA Employment Salary S
Intention Networking

0 1 Female 20 Junior Other Yes 2.9 Full-Time 50.0 1

1 2 Male 23 Senior Management Yes 3.6 Part-Time 25.0 1

2 3 Male 21 Junior Other Yes 2.5 Part-Time 45.0 2

3 4 Male 21 Junior CIS Yes 2.5 Full-Time 40.0 4

4 5 Male 23 Senior Other Undecided 2.8 Unemployed 40.0 2

In [54]:

df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 62 entries, 0 to 61

Data columns (total 14 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 ID 62 non-null int64

1 Gender 62 non-null object

2 Age 62 non-null int64

3 Class 62 non-null object

4 Major 62 non-null object

5 Grad Intention 62 non-null object

6 GPA 62 non-null float64

7 Employment 62 non-null object

8 Salary 62 non-null float64

9 Social Networking 62 non-null int64

10 Satisfaction 62 non-null int64

11 Spending 62 non-null int64

12 Computer 62 non-null object

13 Text Messages 62 non-null int64

dtypes: float64(2), int64(6), object(6)

memory usage: 6.9+ KB

In [55]:

# There are 62 entries and no null values


# data has 2 floats, 6 Integers and 6 Objects

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 10/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [56]:

df.describe()

Out[56]:

Social
ID Age GPA Salary Satisfaction Spending
Networking Mes

count 62.000000 62.000000 62.000000 62.000000 62.000000 62.000000 62.000000 62.0

mean 31.500000 21.129032 3.129032 48.548387 1.516129 3.741935 482.016129 246.2

std 18.041619 1.431311 0.377388 12.080912 0.844305 1.213793 221.953805 214.4

min 1.000000 18.000000 2.300000 25.000000 0.000000 1.000000 100.000000 0.0

25% 16.250000 20.000000 2.900000 40.000000 1.000000 3.000000 312.500000 100.0

50% 31.500000 21.000000 3.150000 50.000000 1.000000 4.000000 500.000000 200.0

75% 46.750000 22.000000 3.400000 55.000000 2.000000 4.000000 600.000000 300.0

max 62.000000 26.000000 3.900000 80.000000 4.000000 6.000000 1400.000000 900.0

In [57]:

pd.crosstab(df['Gender'],df['Major'])

Out[57]:

International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business

Gender

Female 3 3 7 4 4 3

Male 4 1 4 2 6 4

In [58]:

pd.crosstab(df['Gender'],df['Grad Intention'])

Out[58]:

Grad Intention No Undecided Yes

Gender

Female 9 13 11

Male 3 9 17

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 11/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [59]:

pd.crosstab(df['Gender'],df['Employment'])

Out[59]:

Employment Full-Time Part-Time Unemployed

Gender

Female 3 24 6

Male 7 19 3

In [60]:

pd.crosstab(df['Gender'],df['Computer'])

Out[60]:

Computer Desktop Laptop Tablet

Gender

Female 2 29 2

Male 3 26 0

In [61]:

#prob that is randomly selected University student will be male/female


df['Gender'].value_counts()

Out[61]:

Female 33

Male 29

Name: Gender, dtype: int64

In [62]:

#conditional probability of different majors among the male students in University.


pd.crosstab(df['Gender'],df['Major'])

Out[62]:

International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business

Gender

Female 3 3 7 4 4 3

Male 4 1 4 2 6 4

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 12/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [63]:

#probability that a randomly chosen student is a male and intends to graduate


pd.crosstab(df['Gender'],df['Grad Intention'])

Out[63]:

Grad Intention No Undecided Yes

Gender

Female 9 13 11

Male 3 9 17

In [64]:

#probability that a randomly selected student is a female and does NOT have a laptop.
pd.crosstab(df['Gender'],df['Computer'])

Out[64]:

Computer Desktop Laptop Tablet

Gender

Female 2 29 2

Male 3 26 0

In [65]:

#probability that a randomly chosen student is a male or has full-time employment?


pd.crosstab(df['Gender'],df['Employment'])

Out[65]:

Employment Full-Time Part-Time Unemployed

Gender

Female 3 24 6

Male 7 19 3

In [66]:

#conditional probability that given a female student is randomly chosen, she is majoring in
pd.crosstab(df['Gender'],df['Major'])

Out[66]:

International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business

Gender

Female 3 3 7 4 4 3

Male 4 1 4 2 6 4

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 13/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [67]:

#Contingency table of Gender and Intent to Graduate at 2 levels (Y/N). The Undecided studen
pd.crosstab(df['Gender'],df[df['Grad Intention']!='Undecided']['Grad Intention'])

Out[67]:

Grad Intention No Yes

Gender

Female 9 11

Male 3 17

In [68]:

#Do you think the graduate intention and being female are independent events?
p_female = 20/40
p_grad = 28/40
p_female_grad = 11/40
p_female_int_grad = p_female * p_grad

if (p_female_int_grad == p_female_grad):
print('Graduate intention and female are independent events')
else:
print('Graduate intention and female are dependent events')

Graduate intention and female are dependent events

In [69]:

#If a student is chosen randomly, what is the probability that his/her GPA is less than 3?
df[df['GPA']<3]['GPA'].value_counts().sum()

Out[69]:

17

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 14/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [70]:

#2.7.2.
#Find the conditional probability that a randomly selected male earns 50 or more.
#Find the conditional probability that a randomly selected female earns 50 or more.
df[df['Salary']>=50][['Gender','Salary']].value_counts()

Out[70]:

Gender Salary

Female 50.0 5

55.0 5

60.0 5

Male 50.0 4

55.0 3

60.0 3

Female 70.0 1

78.0 1

80.0 1

Male 52.0 1

54.0 1

65.0 1

80.0 1

dtype: int64

In [71]:

#Shapiro test

p_val = stats.shapiro(df['GPA']).pvalue
print('GPA p-value: ' + str(p_val))

p_val = stats.shapiro(df['Salary']).pvalue
print('Salary p-value: ' + str(p_val))

p_val = stats.shapiro(df['Spending']).pvalue
print('Spending p-value: ' + str(p_val))

p_val = stats.shapiro(df['Text Messages']).pvalue


print('Text msgs p-value: ' + str(p_val))

GPA p-value: 0.11204058676958084

Salary p-value: 0.028000956401228905

Spending p-value: 1.6854661225806922e-05

Text msgs p-value: 4.324040673964191e-06

In [72]:

# ABC Asphalt Shingles

In [77]:

df = pd.read_csv('A+&+B+shingles.csv')

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 15/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [78]:

df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 36 entries, 0 to 35

Data columns (total 2 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 A 36 non-null float64

1 B 31 non-null float64

dtypes: float64(2)

memory usage: 704.0 bytes

In [79]:

# There are 36 entries having 2 floats A has 36 non null values and B has 31 non null value

In [80]:

df.describe()

Out[80]:

A B

count 36.000000 31.000000

mean 0.316667 0.273548

std 0.135731 0.137296

min 0.130000 0.100000

25% 0.207500 0.160000

50% 0.290000 0.230000

75% 0.392500 0.400000

max 0.720000 0.580000

In [81]:

df.head()

Out[81]:

A B

0 0.44 0.14

1 0.61 0.15

2 0.47 0.31

3 0.30 0.16

4 0.15 0.37

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 16/17


11/28/21, 11:26 PM Deepak Mahindra_SMDM Final - Jupyter Notebook

In [82]:

stats.ttest_1samp(df['A'],0.35,alternative='two-sided')
#Use pvalue/2

Out[82]:

Ttest_1sampResult(statistic=-1.4735046253382782, pvalue=0.14955266289815025)

In [83]:

stats.ttest_1samp(df['B'],0.35,alternative='two-sided',nan_policy='omit')
#Use pvalue/2

Out[83]:

Ttest_1sampResult(statistic=-3.1003313069986995, pvalue=0.00418095480063836
5)

In [84]:

stats.levene(df.A,df.B.dropna())

Out[84]:

LeveneResult(statistic=0.23808965111555147, pvalue=0.6272312061867605)

In [85]:

stats.ttest_ind(df.A,df.B,equal_var=True,nan_policy='omit',alternative='two-sided')

Out[85]:

Ttest_indResult(statistic=1.2896282719661123, pvalue=0.2017496571835306)

In [ ]:

localhost:8891/notebooks/Deepak Mahindra_SMDM Final.ipynb# 17/17

You might also like