
task6

March 25, 2024

[1]: from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

[2]: %cd "/content/drive/MyDrive/Interview-AI Engineer-VNPay/dataset"

/content/drive/MyDrive/Interview-AI Engineer-VNPay/dataset

[3]: !ls

anscombe.csv data.csv US_Stores.xlsx

[59]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

[6]: df = pd.read_csv("data.csv", encoding="latin1")

[47]: df

[47]: InvoiceNo StockCode Description Quantity \


0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
1 536365 71053 WHITE METAL LANTERN 6
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
… … … … …
541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12
541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6
541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4
541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4
541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3

InvoiceDate UnitPrice CustomerID Country Month \


0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom 12

1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 12
2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom 12
3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 12
4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 12
… … … … … …
541904 2011-12-09 12:50:00 0.85 12680.0 France 12
541905 2011-12-09 12:50:00 2.10 12680.0 France 12
541906 2011-12-09 12:50:00 4.15 12680.0 France 12
541907 2011-12-09 12:50:00 4.15 12680.0 France 12
541908 2011-12-09 12:50:00 4.95 12680.0 France 12

Year TotalSpending
0 2010 15.30
1 2010 20.34
2 2010 22.00
3 2010 20.34
4 2010 20.34
… … …
541904 2011 10.20
541905 2011 12.60
541906 2011 16.60
541907 2011 16.60
541908 2011 14.85

[541909 rows x 11 columns]

[9]: print(df.dtypes)
print(df.isnull().sum())

InvoiceNo object
StockCode object
Description object
Quantity int64
InvoiceDate object
UnitPrice float64
CustomerID float64
Country object
dtype: object
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135080
Country 0
dtype: int64
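Two things stand out here: InvoiceDate is still stored as an object (it is parsed to datetime further down), and about a quarter of the rows have no CustomerID, so they cannot be attributed to any customer. The groupby('CustomerID') calls below silently drop those rows; a minimal sketch that makes the exclusion explicit (the df_customers name is ours, not part of the notebook):

# Rows without a CustomerID (135,080 of 541,909) can't be tied to a customer;
# groupby('CustomerID') drops them implicitly, this filter does it explicitly.
df_customers = df.dropna(subset=['CustomerID']).copy()
df_customers['CustomerID'] = df_customers['CustomerID'].astype(int)  # 17850.0 -> 17850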

[10]: print(df.describe())

Quantity UnitPrice CustomerID


count 541909.000000 541909.000000 406829.000000
mean 9.552250 4.611114 15287.690570
std 218.081158 96.759853 1713.600303
min -80995.000000 -11062.060000 12346.000000
25% 1.000000 1.250000 13953.000000
50% 3.000000 2.080000 15152.000000
75% 10.000000 4.130000 16791.000000
max 80995.000000 38970.000000 18287.000000
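Note the minima: Quantity reaches -80995 and UnitPrice -11062.06. In this dataset negative quantities generally come from cancellation invoices (returns), and the extreme negative prices from manual accounting adjustments, so these summary statistics mix ordinary sales with reversals.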

[19]: num_invoices = df['InvoiceNo'].nunique()
print("Number of unique invoices:", num_invoices)

sales_per_invoice = df.groupby('InvoiceNo').apply(lambda x: (x['UnitPrice'] * x['Quantity']).sum())
print("Sale per invoice:", sales_per_invoice)

Number of unique invoices: 25900


Sale per invoice: InvoiceNo
536365 139.12
536366 22.20
536367 278.73
536368 70.05
536369 17.85

C581484 -168469.60
C581490 -32.53
C581499 -224.69
C581568 -54.75
C581569 -7.50
Length: 25900, dtype: float64
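The tail of the series shows invoice numbers prefixed with 'C' and negative totals: the 'C' prefix marks cancellation (credit) invoices in this dataset. A sketch of splitting them out before computing gross sales (our own filter, not part of the original analysis):

# 'C'-prefixed invoices are cancellations, hence their negative totals
is_cancel = df['InvoiceNo'].astype(str).str.startswith('C')
print("Cancelled invoices:", df.loc[is_cancel, 'InvoiceNo'].nunique())
sales_only = df[~is_cancel]  # keep only ordinary sales lines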

[14]: total_quantity_per_product = df.groupby('Description')['Quantity'].sum()
print("Total Quantity Sold per Product:")
print(total_quantity_per_product)

Total Quantity Sold per Product:


Description
4 PURPLE FLOCK DINNER CANDLES 144
50'S CHRISTMAS GIFT BAG LARGE 1913
DOLLY GIRL BEAKER 2448
I LOVE LONDON MINI BACKPACK 389
I LOVE LONDON MINI RUCKSACK 1

wrongly marked carton 22804 -256
wrongly marked. 23343 in box -3100

wrongly sold (22719) barcode 170
wrongly sold as sets -600
wrongly sold sets -975
Name: Quantity, Length: 4223, dtype: int64

[15]: top_selling_products = df.groupby('Description')['Quantity'].sum().nlargest(10)
print("Top Selling Products:")
print(top_selling_products)

Top Selling Products:


Description
WORLD WAR 2 GLIDERS ASSTD DESIGNS 53847
JUMBO BAG RED RETROSPOT 47363
ASSORTED COLOUR BIRD ORNAMENT 36381
POPCORN HOLDER 36334
PACK OF 72 RETROSPOT CAKE CASES 36039
WHITE HANGING HEART T-LIGHT HOLDER 35317
RABBIT NIGHT LIGHT 30680
MINI PAINT SET VINTAGE 26437
PACK OF 12 LONDON TISSUES 26315
PACK OF 60 PINK PAISLEY CAKE CASES 24753
Name: Quantity, dtype: int64

[42]: plt.figure(figsize=(8, 8))
plt.pie(top_selling_products, labels=top_selling_products.index, autopct="%1.1f%%")
plt.title("Top 10 Selling Products")
plt.show()

[16]: total_revenue_per_country = df.groupby('Country')['UnitPrice'].sum() * df.groupby('Country')['Quantity'].sum()

print("Total Revenue per Country:")
print(total_revenue_per_country)

Total Revenue per Country:


Country
Australia 3.391920e+08
Austria 8.213237e+06
Bahrain 2.250820e+04
Belgium 1.745691e+08
Brazil 5.076560e+04
Canada 2.515933e+06
Channel Islands 3.543772e+07
Cyprus 2.476308e+07
Czech Republic 5.218480e+04
Denmark 1.037379e+07
EIRE 6.910362e+09
European Community 1.461429e+05
Finland 4.039054e+07
France 4.754174e+09
Germany 4.423796e+09
Greece 1.109879e+06
Hong Kong 5.837971e+07
Iceland 1.182814e+06
Israel 4.697061e+06
Italy 3.103124e+07
Japan 2.054914e+07
Lebanon 9.358184e+04
Lithuania 6.483488e+04
Malta 6.287134e+05
Netherlands 1.299341e+09
Norway 1.256648e+08
Poland 5.195552e+06
Portugal 2.109474e+08
RSA 8.733120e+04
Saudi Arabia 1.808250e+03
Singapore 1.314199e+08
Spain 3.388797e+08
Sweden 6.439000e+07
Switzerland 2.066251e+08
USA 6.669093e+05
United Arab Emirates 2.257520e+05
United Kingdom 9.575347e+12
Unspecified 3.973233e+06

dtype: float64
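A caveat on this figure: it multiplies each country's summed unit prices by its summed quantities, which is not the same as revenue. Revenue sums the per-line totals, as in the sketch below; the numbers above therefore greatly overstate the large markets (we have not re-run them):

# Revenue = quantity x price per invoice line, summed per country,
# not sum(prices) x sum(quantities)
revenue_per_country = (df['Quantity'] * df['UnitPrice']).groupby(df['Country']).sum()
print(revenue_per_country.sort_values(ascending=False))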

[38]: total_revenue_per_country = (df.groupby('Country')['UnitPrice'].sum() * df.groupby('Country')['Quantity'].sum()).sort_values()

plt.figure(figsize=(12, 8))
plt.bar(total_revenue_per_country.index, total_revenue_per_country, color='skyblue')
plt.title('Total Revenue per Country (Logarithmic Scale)')
plt.xlabel('Country')
plt.ylabel('Total Revenue')
plt.xticks(rotation=90)
plt.yscale('log')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

• The UK is by far the largest market, which is why a logarithmic scale is needed to keep the other countries visible

[17]: df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Month'] = df['InvoiceDate'].dt.month
monthly_sales = df.groupby('Month')['UnitPrice'].sum()
print("Monthly Sales Trend:")
print(monthly_sales)

Monthly Sales Trend:


Month
1 172752.800
2 127448.770
3 171486.510
4 129164.961
5 190685.460
6 200717.340
7 171906.791
8 150385.680
9 199235.212
10 263434.090
11 327149.850
12 394436.510
Name: UnitPrice, dtype: float64
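As with the country figures, this series sums UnitPrice without weighting by quantity. A revenue-based version of the same monthly trend would be (our variant, not re-run here):

# Weight each line by its quantity to get monthly revenue rather than summed prices
monthly_revenue = (df['Quantity'] * df['UnitPrice']).groupby(df['Month']).sum()
print(monthly_revenue)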

[33]: monthly_sales = df.groupby('Month')['UnitPrice'].sum()

plt.figure(figsize=(10, 6))
monthly_sales.plot(kind='bar', color='skyblue')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

• December shows the highest sales activity, consistent with holiday-season shopping
[23]: df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Year'] = df['InvoiceDate'].dt.year
yearly_sales = df.groupby('Year')['UnitPrice'].sum()
print("Yearly Sales Trend:")
print(yearly_sales)

Yearly Sales Trend:


Year
2010 260520.850
2011 2238283.124
Name: UnitPrice, dtype: float64
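The two years are not directly comparable: the invoices run from 2010-12-01 to 2011-12-09 (visible in the InvoiceDate column above), so 2010 contributes only one month of data. A quick check:

# The dataset covers 2010-12-01 through 2011-12-09, so 2010 is December only
print(df['InvoiceDate'].min(), df['InvoiceDate'].max())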

[32]: yearly_sales = df.groupby('Year')['UnitPrice'].sum()

plt.figure(figsize=(10, 6))
yearly_sales.plot(kind='bar', color='skyblue')
plt.title('Yearly Sales Trend')
plt.xlabel('Year')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

Let's find some VIP customers
[60]: df['TotalSpending'] = df['Quantity'] * df['UnitPrice']
total_spending_per_customer = df.groupby('CustomerID')['TotalSpending'].sum()
top_10_customers = total_spending_per_customer.nlargest(10)

print("Top 10 Customers with Highest Total Spending:")


print(top_10_customers)

Top 10 Customers with Highest Total Spending:


CustomerID
14646.0 279489.02
18102.0 256438.49
17450.0 187482.17
14911.0 132572.62
12415.0 123725.45
14156.0 113384.14
17511.0 88125.38
16684.0 65892.08
13694.0 62653.10
15311.0 59419.34
Name: TotalSpending, dtype: float64

[25]: num_customers = df['CustomerID'].nunique()
print("Number of unique customers:", num_customers)
sales_per_customer = df.groupby('CustomerID').apply(lambda x: (x['UnitPrice'] * x['Quantity']).sum())
print("Sale per customer:", sales_per_customer)

Number of unique customers: 4372


Sale per customer: CustomerID
12346.0 0.00
12347.0 4310.00
12348.0 1797.24
12349.0 1757.55
12350.0 334.40

18280.0 180.60
18281.0 80.82
18282.0 176.60
18283.0 2094.88
18287.0 1837.28
Length: 4372, dtype: float64
IQR (Interquartile Range) is used to remove outliers because it provides a robust measure of spread
that is less affected by extreme values compared to other measures like standard deviation.
The IQR is calculated as the difference between the third quartile (Q3) and the first quartile (Q1)
of a dataset. By definition, the interquartile range captures the middle 50% of the data, making it
resistant to the influence of outliers at the tails of the distribution.
Outliers can significantly skew statistical analyses and machine learning models, leading to mis-
leading results. Removing outliers using the IQR method helps to ensure that the data used
for analysis is more representative of the underlying distribution, improving the reliability of the
insights obtained.
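Concretely, from the describe() output above: Q1 = 1.25 and Q3 = 4.13 for UnitPrice, so IQR = 2.88 and the 1.5 x IQR fences are 1.25 - 4.32 = -3.07 and 4.13 + 4.32 = 8.45; every unit price outside [-3.07, 8.45] is treated as an outlier below.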
[31]: Q1 = df['UnitPrice'].quantile(0.25)
Q3 = df['UnitPrice'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered = df[(df['UnitPrice'] >= lower_bound) & (df['UnitPrice'] <= upper_bound)]

plt.figure(figsize=(10, 6))
plt.hist(df_filtered['UnitPrice'], bins=50, color='skyblue', edgecolor='black')
plt.title('Unit Price Distribution (Without Outliers - IQR Method)')
plt.xlabel('Unit Price')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

[30]: df['UnitPrice'].max()

[30]: 38970.0

Customer Segmentation
[58]: total_spending = df.groupby('CustomerID')['UnitPrice'].sum()
purchase_frequency = df.groupby('CustomerID')['InvoiceNo'].nunique()
average_spending_per_purchase = total_spending / purchase_frequency
customer_metrics = pd.DataFrame({
    'TotalSpending': total_spending,
    'PurchaseFrequency': purchase_frequency,
    'AvgSpendingPerPurchase': average_spending_per_purchase
})

# Normalize the data (z-score: zero mean, unit variance per column)
customer_metrics_normalized = (customer_metrics - customer_metrics.mean()) / customer_metrics.std()

# Perform K-means clustering
# n_init is pinned to the pre-1.4 default (10) to silence the FutureWarning
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(customer_metrics_normalized)
customer_metrics['Cluster'] = kmeans.labels_

plt.figure(figsize=(10, 8))
sns.scatterplot(data=customer_metrics, x='PurchaseFrequency', y='TotalSpending', hue='Cluster', palette='viridis')
plt.title('Customer Segmentation')
plt.xlabel('Purchase Frequency')
plt.ylabel('Total Spending')
plt.show()

segment_analysis = customer_metrics.groupby('Cluster').agg({
    'TotalSpending': 'mean',
    'PurchaseFrequency': 'mean',
    'AvgSpendingPerPurchase': 'mean'
})
print(segment_analysis)


TotalSpending PurchaseFrequency AvgSpendingPerPurchase
Cluster
0 1487.143013 27.589958 86.416492
1 26184.300000 126.833333 726.355225
2 207.224451 3.594280 62.041187
3 40278.900000 5.000000 8055.780000
• Cluster 0: Regular Spenders
• Cluster 1: High-Value Spenders
• Cluster 2: Occasional Buyers
• Cluster 3: VIP Customers
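One caveat before reading too much into these labels: TotalSpending here sums UnitPrice without quantities, so it is closer to "sum of listed prices" than to money spent. A spending-weighted variant of the first metric (we have not re-run the clustering on it, so the segment boundaries above could shift):

# Actual money spent per customer: quantity x price per line, summed
total_spending = (df['Quantity'] * df['UnitPrice']).groupby(df['CustomerID']).sum()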
Customer Churn Analysis
[62]: churn_period = 180
last_purchase_date = df.groupby('CustomerID')['InvoiceDate'].max()
current_date = df['InvoiceDate'].max()
# Flag churn on the per-customer Series, not on transaction rows,
# so the rate below is a share of customers rather than of invoice lines
churned = (current_date - last_purchase_date).dt.days > churn_period
churn_rate = churned.mean() * 100
print("Churn Rate:", churn_rate, "%")

Churn Rate: 19.7163769441903 %
Roughly 1 in 5 customers made no transaction during the final six months covered by the data
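Note that churn is measured relative to the last invoice date in the dataset (2011-12-09), not the date of the analysis, and the 180-day window is a modeling choice; a shorter window would classify more customers as churned.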
[ ]:
