Professional Documents
Culture Documents
In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import scipy.stats as stats
import seaborn as sns
import os
os.getcwd()
os.chdir('C:\\Users\\USER\\Downloads')
In [8]:
df_Wholesale = pd.read_csv('Wholesale+Customers+Data.csv')
In [9]:
df.head()
Out[9]:
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
In [11]:
# There are 440 entries with 9 columns however there is no null values.
# There are 7 integers and 2 objects
In [12]:
df.describe(include='all')
Out[12]:
In [13]:
# Hotel as channel and other as region has spent the most on above data and retail spend th
In [14]:
df2 = df.copy()
df2['Total'] = df2['Fresh'] + df2['Milk'] + df2['Grocery'] + df2['Frozen'] + df2['Detergent
In [15]:
In [16]:
df2.head()
Out[16]:
In [17]:
df2.describe(include='all')
Out[17]:
In [20]:
df2.groupby('Region')['Total'].sum()
Out[20]:
Region
Lisbon 2386813
Oporto 1555088
Other 10677599
In [24]:
plt.figure(figsize = (6,4))
df2.groupby('Region')['Total'].sum().plot.bar();
In [26]:
df2.groupby('Channel')['Total'].sum()
Out[26]:
Channel
Hotel 7999569
Retail 6619931
In [27]:
df2.groupby('Channel')['Total'].sum().plot.bar();
In [28]:
Out[28]:
Total
Region
Lisbon 2386813
Oporto 1555088
Other 10677599
In [29]:
Out[29]:
Total
Channel
Hotel 7999569
Retail 6619931
In [30]:
# Hotel has spend the most and Retails spend the least
In [31]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
Out[31]:
Region
In [32]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
In [33]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
Out[33]:
Channel
In [34]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
In [35]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
Out[35]:
Region Channel
In [36]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
In [37]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicates
Out[37]:
Channel Hotel Retail Hotel Retail Hotel Retail Hotel Retail Hotel Ret
Region
Lisbon 70632 33695 56081 148055 761233 93600 184512 46514 237542 3324
Oporto 30965 23541 13516 159795 326215 138506 160861 29271 123074 3102
Other 320358 191752 165990 724420 2928269 1032308 771606 158886 820101 16751
In [38]:
Out[38]:
Region
In [39]:
#out of all different varities Fresh has the highest values and delicatessen has the lowest
# other has maximum number of channel and regions however Oporto has a least number of Chan
In [40]:
df.std()/df.mean()
Out[40]:
Buyer/Spender 0.576695
Fresh 1.053918
Milk 1.273299
Grocery 1.195174
Frozen 1.580332
Detergents_Paper 1.654647
Delicatessen 1.849407
dtype: float64
In [41]:
df.describe().T['std'] / df.describe().T['mean']
Out[41]:
Buyer/Spender 0.576695
Fresh 1.053918
Milk 1.273299
Grocery 1.195174
Frozen 1.580332
Detergents_Paper 1.654647
Delicatessen 1.849407
dtype: float64
In [42]:
sns.boxplot(data=df);
In [43]:
#Outliers are important because of this we need to understand our data more accurately
In [49]:
#Analysis Report
#On the basis of analysis we should concentrate more on Hotel and other region because we h
#Fresh items we are spending more than other however delicatession has lowest spend
#we have obeserved the retail has less spend and they are spending more on Grocessary
In [50]:
In [51]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
In [52]:
df = pd.read_csv('Survey-1.csv')
In [53]:
df.head()
Out[53]:
Grad Social
ID Gender Age Class Major GPA Employment Salary S
Intention Networking
In [54]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
0 ID 62 non-null int64
In [55]:
In [56]:
df.describe()
Out[56]:
Social
ID Age GPA Salary Satisfaction Spending
Networking Mes
In [57]:
pd.crosstab(df['Gender'],df['Major'])
Out[57]:
International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business
Gender
Female 3 3 7 4 4 3
Male 4 1 4 2 6 4
In [58]:
pd.crosstab(df['Gender'],df['Grad Intention'])
Out[58]:
Gender
Female 9 13 11
Male 3 9 17
In [59]:
pd.crosstab(df['Gender'],df['Employment'])
Out[59]:
Gender
Female 3 24 6
Male 7 19 3
In [60]:
pd.crosstab(df['Gender'],df['Computer'])
Out[60]:
Gender
Female 2 29 2
Male 3 26 0
In [61]:
Out[61]:
Female 33
Male 29
In [62]:
Out[62]:
International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business
Gender
Female 3 3 7 4 4 3
Male 4 1 4 2 6 4
In [63]:
Out[63]:
Gender
Female 9 13 11
Male 3 9 17
In [64]:
#probability that a randomly selected student is a female and does NOT have a laptop.
pd.crosstab(df['Gender'],df['Computer'])
Out[64]:
Gender
Female 2 29 2
Male 3 26 0
In [65]:
Out[65]:
Gender
Female 3 24 6
Male 7 19 3
In [66]:
#conditional probability that given a female student is randomly chosen, she is majoring in
pd.crosstab(df['Gender'],df['Major'])
Out[66]:
International
Major Accounting CIS Economics/Finance Management Other Retailing/Marke
Business
Gender
Female 3 3 7 4 4 3
Male 4 1 4 2 6 4
In [67]:
#Contingency table of Gender and Intent to Graduate at 2 levels (Y/N). The Undecided studen
pd.crosstab(df['Gender'],df[df['Grad Intention']!='Undecided']['Grad Intention'])
Out[67]:
Gender
Female 9 11
Male 3 17
In [68]:
#Do you think the graduate intention and being female are independent events?
p_female = 20/40
p_grad = 28/40
p_female_grad = 11/40
p_female_int_grad = p_female * p_grad
if (p_female_int_grad == p_female_grad):
print('Graduate intention and female are independent events')
else:
print('Graduate intention and female are dependent events')
In [69]:
#If a student is chosen randomly, what is the probability that his/her GPA is less than 3?
df[df['GPA']<3]['GPA'].value_counts().sum()
Out[69]:
17
In [70]:
#2.7.2.
#Find the conditional probability that a randomly selected male earns 50 or more.
#Find the conditional probability that a randomly selected female earns 50 or more.
df[df['Salary']>=50][['Gender','Salary']].value_counts()
Out[70]:
Gender Salary
Female 50.0 5
55.0 5
60.0 5
Male 50.0 4
55.0 3
60.0 3
Female 70.0 1
78.0 1
80.0 1
Male 52.0 1
54.0 1
65.0 1
80.0 1
dtype: int64
In [71]:
#Shapiro test
p_val = stats.shapiro(df['GPA']).pvalue
print('GPA p-value: ' + str(p_val))
p_val = stats.shapiro(df['Salary']).pvalue
print('Salary p-value: ' + str(p_val))
p_val = stats.shapiro(df['Spending']).pvalue
print('Spending p-value: ' + str(p_val))
In [72]:
In [77]:
df = pd.read_csv('A+&+B+shingles.csv')
In [78]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
0 A 36 non-null float64
1 B 31 non-null float64
dtypes: float64(2)
In [79]:
# There are 36 entries having 2 floats A has 36 non null values and B has 31 non null value
In [80]:
df.describe()
Out[80]:
A B
In [81]:
df.head()
Out[81]:
A B
0 0.44 0.14
1 0.61 0.15
2 0.47 0.31
3 0.30 0.16
4 0.15 0.37
In [82]:
stats.ttest_1samp(df['A'],0.35,alternative='two-sided')
#Use pvalue/2
Out[82]:
Ttest_1sampResult(statistic=-1.4735046253382782, pvalue=0.14955266289815025)
In [83]:
stats.ttest_1samp(df['B'],0.35,alternative='two-sided',nan_policy='omit')
#Use pvalue/2
Out[83]:
Ttest_1sampResult(statistic=-3.1003313069986995, pvalue=0.00418095480063836
5)
In [84]:
stats.levene(df.A,df.B.dropna())
Out[84]:
LeveneResult(statistic=0.23808965111555147, pvalue=0.6272312061867605)
In [85]:
stats.ttest_ind(df.A,df.B,equal_var=True,nan_policy='omit',alternative='two-sided')
Out[85]:
Ttest_indResult(statistic=1.2896282719661123, pvalue=0.2017496571835306)
In [ ]: