
K-MEANS CLUSTERING & APRIORI

Submitted by : R K SUJAN

Reg No: 200928009


K-MEANS CLUSTERING

The K-means algorithm is an iterative algorithm that tries to partition the dataset into K
pre-defined, distinct, non-overlapping subgroups (clusters), where each data point
belongs to exactly one group. It tries to make the intra-cluster data points as similar as
possible while keeping the clusters as different (far apart) as possible. It assigns data
points to clusters so that the sum of the squared distances between the data points
and their cluster's centroid (the arithmetic mean of all the data points that belong to that
cluster) is at a minimum. The less variation we have within clusters, the more
homogeneous (similar) the data points are within the same cluster.
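
Concretely, the quantity being minimised is the within-cluster sum of squared distances. A minimal sketch of that computation, assuming points, labels and centroids are NumPy arrays (the names are chosen here purely for illustration):

import numpy as np

def within_cluster_ss(points, labels, centroids):
    #sum of squared distances from each point to the centroid of its assigned cluster
    return sum(np.sum((points[labels == k] - c) ** 2)
               for k, c in enumerate(centroids))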

The K-Means algorithm works as follows:

1. Specify the number of clusters K.

2. Initialize the centroids by first shuffling the dataset and then randomly selecting K data points as the centroids, without replacement.

3. Keep iterating until there is no change to the centroids, i.e. the assignment of data points to clusters stops changing.

4. Compute the sum of the squared distances between the data points and all centroids.

5. Assign each data point to the closest cluster (centroid).

6. Compute the centroids of the clusters by taking the average of all the data points that belong to each cluster.
ALGORITHM FOR K-MEANS CLUSTERING
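
A minimal from-scratch sketch of the steps listed above, in plain NumPy (an illustration only, not the scikit-learn implementation used in the next section; empty clusters are not handled):

import numpy as np

def kmeans_from_scratch(points, k, max_iters=100, seed=0):
    rng = np.random.default_rng(seed)
    #Step 2: pick K data points as the initial centroids, without replacement
    centroids = points[rng.choice(len(points), size=k, replace=False)]
    for _ in range(max_iters):
        #Steps 4-5: assign each point to its closest centroid
        distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        #Step 6: recompute each centroid as the mean of the points assigned to it
        new_centroids = np.array([points[labels == j].mean(axis=0) for j in range(k)])
        #Step 3: stop once the centroids no longer change
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels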
CODE FOR K-MEANS
#import required libraries

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

#generate our dataset

dataset = make_blobs(n_samples=200, centers=4, n_features=2, cluster_std=1.6, random_state=50)

points = dataset[0]

#import kmeans

from sklearn.cluster import KMeans

# Create a KMeans object

kmeans = KMeans(n_clusters=4)

# Fit the KMeans object to the dataset

kmeans.fit(points)


plt.scatter(dataset[0][:,0], dataset[0][:,1])

#coordinates of the four cluster centroids
clusters = kmeans.cluster_centers_

#print out the cluster centroids
print(clusters)

[[-2.40167949 10.17352695]
 [ 0.05161133 -5.35489826]
 [-5.56465793 -2.34988939]
 [-1.92101646  5.21673484]]

#fit again and predict the cluster index of each point
y_km = kmeans.fit_predict(points)

#plot the points coloured by their assigned cluster

plt.scatter(points[y_km == 0, 0], points[y_km == 0, 1], s=50, color='red')

plt.scatter(points[y_km == 1, 0], points[y_km == 1, 1], s=50, color='green')

plt.scatter(points[y_km == 2, 0], points[y_km == 2, 1], s=50, color='yellow')

plt.scatter(points[y_km == 3, 0], points[y_km == 3, 1], s=50, color='cyan')

#mark the four cluster centroids with black stars

plt.scatter(clusters[0][0], clusters[0][1], marker='*', s=200, color='black')

plt.scatter(clusters[1][0], clusters[1][1], marker='*', s=200, color='black')

plt.scatter(clusters[2][0], clusters[2][1], marker='*', s=200, color='black')

plt.scatter(clusters[3][0], clusters[3][1], marker='*', s=200, color='black')

plt.show()
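
After fitting, the KMeans object also exposes the minimised objective and can assign new points to the learned clusters; for example (inertia_ and predict are standard scikit-learn attributes/methods, and the sample point below is chosen arbitrarily):

#total within-cluster sum of squared distances for the fitted model
print(kmeans.inertia_)

#cluster index assigned to a new, unseen point
print(kmeans.predict([[0.0, 5.0]]))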

DATASET

The make_blobs call returns a tuple containing a 200 x 2 array of point coordinates and a length-200 array of the corresponding cluster labels (0-3). The beginning of that output is shown below; the remaining rows are omitted.

(array([[-1.06705283e+00, 9.24306355e+00],
        [ 1.08654318e+00, -6.94815805e+00],
        [-2.30970265e+00, 5.84969440e+00],
        ...]),
 array([2, 0, 3, 3, 1, 0, 2, ...]))
APRIORI ALGORITHM
Agrawal and Srikant proposed the Apriori algorithm in 1994. Apriori uses a
"bottom-up" approach, in which frequent subsets are extended one item at a time
(a step known as candidate generation) and groups of candidates are tested
against the data. The algorithm terminates when no further successful
extensions are found. Apriori uses breadth-first search to count candidate item
sets efficiently. The algorithm relies on the downward closure property, which states
that "any subset of a frequent itemset must be frequent". It is called
Apriori because it uses prior knowledge of frequent itemset properties.
It uses a level-wise search, where k-itemsets (an itemset containing k items
is called a k-itemset) are used to explore (k+1)-itemsets when mining
frequent itemsets from a transactional database. First, the set of frequent
1-itemsets (L1) is found. L1 is used to find L2, which is used to find L3, and so on,
until no more frequent k-itemsets can be found.
The candidate-gen function takes Lk-1 (the frequent (k-1)-itemsets) and returns a superset (called the
candidates, Ck) of the set of all frequent k-itemsets. It has two steps:

• Join step: generate all possible candidate itemsets Ck of length k by joining pairs of frequent (k-1)-itemsets.

• Prune step: remove those candidates in Ck that cannot be frequent because they contain an infrequent (k-1)-subset.
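
A small sketch of these two steps using plain Python sets (an illustration, not the mlxtend implementation used later; prev_frequent is assumed to be a set of frozensets, each of size k-1):

from itertools import combinations

def candidate_gen(prev_frequent, k):
    #join step: union pairs of frequent (k-1)-itemsets that together contain exactly k items
    candidates = {a | b for a in prev_frequent for b in prev_frequent if len(a | b) == k}
    #prune step: drop candidates containing any (k-1)-subset that is not frequent
    return {c for c in candidates
            if all(frozenset(s) in prev_frequent for s in combinations(c, k - 1))}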

Apriori Property –
All non-empty subsets of a frequent itemset must be frequent. The key concept
of the Apriori algorithm is the anti-monotonicity of the support measure. Apriori
assumes that:

All subsets of a frequent itemset must be frequent (the Apriori property). For
example, if {milk, bread} is frequent, then {milk} and {bread} must each be frequent.

If an itemset is infrequent, all of its supersets will be infrequent, so they can
be pruned without counting their support.
Limitations of the Apriori Algorithm

The Apriori algorithm can be slow. Its main limitation is the time and memory required to hold
and test a vast number of candidate itemsets when there are many frequent itemsets, a low
minimum support threshold, or large itemsets, i.e. it is not an efficient approach for very
large datasets. For example, if there are 10^4 frequent 1-itemsets, the algorithm needs to
generate more than 10^7 candidate 2-itemsets (roughly 10^4 choose 2, about 5 x 10^7), which
must then be tested and their supports accumulated. Furthermore, to detect a frequent pattern
of size 100, i.e. {v1, v2, ..., v100}, it would have to generate on the order of 2^100 candidate
itemsets, making candidate generation costly and time-consuming. The algorithm therefore checks
a very large number of candidate itemsets and scans the database repeatedly to count their
supports. Apriori becomes very slow and inefficient when memory capacity is limited and the
number of transactions is large.
PSEUDOCODE OF THE ALGORITHM
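
A compact sketch of the level-wise loop in the same spirit (assuming the candidate_gen sketch above; transactions is a list of item sets and min_sup an absolute support count, both names chosen for illustration):

def apriori_levelwise(transactions, min_sup):
    #L1: frequent 1-itemsets
    items = {item for t in transactions for item in t}
    Lk = {frozenset([item]) for item in items
          if sum(item in t for t in transactions) >= min_sup}
    frequent = set(Lk)
    k = 2
    #extend frequent (k-1)-itemsets until no frequent k-itemsets remain
    while Lk:
        Ck = candidate_gen(Lk, k)
        Lk = {c for c in Ck if sum(c <= t for t in transactions) >= min_sup}
        frequent |= Lk
        k += 1
    return frequent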
APRIORI CODE
# Loading necessary packages

import numpy as np

import pandas as pd

from mlxtend.frequent_patterns import apriori

from mlxtend.frequent_patterns import association_rules

# Reading Data From Web

myretaildata = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')

myretaildata.head()

#Data Cleaning

myretaildata['Description'] = myretaildata['Description'].str.strip() #remove leading/trailing spaces

myretaildata.dropna(axis=0, subset=['InvoiceNo'], inplace=True) #drop rows with missing invoice numbers

myretaildata['InvoiceNo'] = myretaildata['InvoiceNo'].astype('str') #convert invoice numbers to strings

myretaildata = myretaildata[~myretaildata['InvoiceNo'].str.contains('C')] #remove credit transactions

myretaildata.head()

myretaildata['Country'].value_counts()

#myretaildata.shape

#Separating transactions for Germany

mybasket = (myretaildata[myretaildata['Country'] == "Germany"]
            .groupby(['InvoiceNo', 'Description'])['Quantity']
            .sum().unstack().reset_index().fillna(0)
            .set_index('InvoiceNo'))

#viewing transaction basket

mybasket.head()

#converting all positive values to 1 and everything else to 0

def my_encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

my_basket_sets = mybasket.applymap(my_encode_units)

my_basket_sets.drop('POSTAGE', inplace=True, axis=1) #remove "POSTAGE" as an item

#Generating frequent itemsets

my_frequent_itemsets = apriori(my_basket_sets, min_support=0.07, use_colnames=True)

#Generating rules

my_rules = association_rules(my_frequent_itemsets, metric="lift", min_threshold=1)

#viewing top 100 rules

my_rules.head(100)

#number of baskets that contain each of these two items
my_basket_sets['ROUND SNACK BOXES SET OF 4 WOODLAND'].sum()

my_basket_sets['SPACEBOY LUNCH BOX'].sum()

#Filtering rules based on condition


my_rules[(my_rules['lift'] >= 3) & (my_rules['confidence'] >= 0.3)]
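
As a follow-up, the surviving rules can also be ranked, for example by lift, using a standard pandas call:

#show the ten rules with the highest lift
my_rules.sort_values('lift', ascending=False).head(10)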
