Professional Documents
Culture Documents
18BCE2301
Devangshu Mazumder
Aim:
Design and implement K-Means and K-Medoids clustering using scikit-learn.
Abstract:
k-means clustering is a method of vector quantization, originally from signal processing, that
aims to partition n observations into k clusters in which each observation belongs to the
cluster with the nearest mean, serving as a prototype of the cluster.
k-medoids is a classical partitioning technique of clustering that splits a data set of n
objects into k clusters, where the number k of clusters is assumed to be known a priori
(which implies that the programmer must specify k before executing a k-medoids algorithm).
Sample Code:
import numpy as np
import pandas as pd

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
# ---- K-Means on the iris data set ----
# 150 samples, 4 features, 3 species.
iris = datasets.load_iris()
print(iris.data)
print(iris.target_names)
print(iris.target)

# The definition of `x` was lost in extraction; it is the feature matrix
# used by x.head() and iris_k_mean_model.fit(x) below.
x = pd.DataFrame(iris.data,
                 columns=['Sepal Length', 'Sepal Width',
                          'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])
x.head()
y.head()

# Scatter the raw data, coloured by the true species.
plt.figure(figsize=(12, 3))
colors = np.array(['red', 'green', 'blue'])
iris_targets_legend = np.array(iris.target_names)
red_patch = mpatches.Patch(color='red', label='Setosa')
green_patch = mpatches.Patch(color='green', label='Versicolor')
blue_patch = mpatches.Patch(color='blue', label='Virginica')

plt.subplot(1, 2, 1)
plt.scatter(x['Sepal Length'], x['Sepal Width'], c=colors[y['Target']])
plt.title('Sepal Length vs Sepal Width')
plt.legend(handles=[red_patch, green_patch, blue_patch])

plt.subplot(1, 2, 2)
plt.scatter(x['Petal Length'], x['Petal Width'], c=colors[y['Target']])
plt.title('Petal Length vs Petal Width')
plt.legend(handles=[red_patch, green_patch, blue_patch])

# Fit a 3-cluster K-Means model on the four features.
iris_k_mean_model = KMeans(n_clusters=3)
iris_k_mean_model.fit(x)
print(iris_k_mean_model.labels_)
print(iris_k_mean_model.cluster_centers_)

# True classes vs. model clusters, side by side.
plt.figure(figsize=(15, 8))
colors = np.array(['red', 'green', 'blue', 'yellow'])

plt.subplot(1, 2, 1)
plt.scatter(x['Petal Length'], x['Petal Width'], c=colors[y['Target']], s=40)
plt.title('Before classification')

# `predictedY` was never defined in the extracted text: it is the model's
# per-sample cluster id.  NOTE(review): k-means cluster ids are arbitrary,
# so the accuracy/confusion below is only meaningful after mapping cluster
# ids onto class labels.
predictedY = iris_k_mean_model.labels_

plt.subplot(1, 2, 2)
plt.scatter(x['Petal Length'], x['Petal Width'], c=colors[predictedY], s=40)
plt.title("Model's classification")

sm.accuracy_score(predictedY, y['Target'])
sm.confusion_matrix(predictedY, y['Target'])
# K-Medoids (the original header read "M-Medoids" — a typo)
# NOTE(review): `data`, `PCAdf`, `points` and `ax` are never defined in the
# extracted text — the lines that load the data set, wrap the PCA output in
# a DataFrame, and create the 3-D Axes were evidently lost during
# extraction; restore them from the source notebook.
scaler = MinMaxScaler()
# PCA Transformation: reduce the (scaled) data to 3 components.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
datapoints = PCAdf.values
# m = number of samples, f = number of (PCA) features.
m, f = datapoints.shape
k=3
#Visualization
X_reduced = points
# Hide the tick labels on all three axes of the 3-D scatter plot.
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()
def init_medoids(X, k):
    """Return k distinct rows of X, sampled at random, as the initial medoids.

    The RNG is re-seeded so repeated runs pick the same starting medoids.
    (The `def` line and the sampling line were lost in extraction — the
    fragment had a bare `seed(1)` and `return X[samples, :]` with `samples`
    undefined; `np.random.seed` replaces the undefined `seed`.)
    """
    np.random.seed(1)
    samples = np.random.choice(len(X), size=k, replace=False)
    return X[samples, :]
# Choose 3 starting medoids from the PCA-reduced points.
medoids_initial = init_medoids(datapoints, 3)
def compute_d_p(X, medoids, p):
    """Dissimilarity matrix S of shape (len(X), len(medoids)).

    S[i, j] = ||X[i] - medoids[j]||_p ** p, i.e. the p-norm distance raised
    to the p-th power (squared Euclidean when p == 2).  A single medoid
    passed as a 1-D vector is promoted to a (1, d) matrix first.
    """
    m = len(X)
    medoids_shape = medoids.shape
    if len(medoids_shape) == 1:
        # A lone medoid arrives as a flat vector; make it one row.
        medoids = medoids.reshape((1, len(medoids)))
    k = len(medoids)
    S = np.empty((m, k))
    for i in range(m):
        # Distance of point i to every medoid.  (This line was dropped in
        # extraction — `d_i` was otherwise undefined.)
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i ** p
    return S
# p=2 (squared-Euclidean) dissimilarity of every point to every initial medoid.
S = compute_d_p(datapoints, medoids_initial, 2)
def assign_labels(S):
    """Label each point with the column index of its nearest medoid.

    The body was lost in extraction; this is the standard assignment step —
    a row-wise argmin over the dissimilarity matrix S.
    """
    return np.argmin(S, axis=1)
# Initial cluster assignment from the starting medoids.
labels = assign_labels(S)
def update_medoids(X, medoids, p):
    """One medoid-update sweep of k-medoids.

    For each cluster, try every member point as a candidate medoid and keep
    whichever minimises the cluster's total dissimilarity.  Returns a new
    medoid array; the caller's `medoids` is not modified.

    Reconstruction notes: the `def` line, the dissimilarity bookkeeping and
    the comparison were lost in extraction; the fragment also read the
    global `datapoints` instead of the parameter, and aliased
    `out_medoids = medoids` (so improvements mutated the input).
    """
    S = compute_d_p(X, medoids, p)
    labels = assign_labels(S)
    # Copy so that improving a medoid does not mutate the caller's array.
    out_medoids = medoids.copy()
    for i in set(labels):
        cluster_points = X[labels == i]
        # Total dissimilarity of cluster i under its current medoid.
        avg_dissimilarity = np.sum(compute_d_p(cluster_points, medoids[i], p))
        for datap in cluster_points:
            # Candidate medoid: an actual member of the cluster.
            new_medoid = datap
            new_dissimilarity = np.sum(compute_d_p(cluster_points, new_medoid, p))
            if new_dissimilarity < avg_dissimilarity:
                avg_dissimilarity = new_dissimilarity
                out_medoids[i] = datap
    return out_medoids
# Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster X into k groups with the k-medoids algorithm.

    Alternates assignment (nearest medoid under the p-norm dissimilarity)
    and medoid update until the medoids stop changing or `max_steps`
    iterations have run.  Returns a tuple (medoids, labels).

    Reconstruction notes: the `def` line, the `while` header and the
    convergence update were lost in extraction (`converged` was set once
    and never updated); convergence is detected by comparing the medoid
    array before and after an update sweep.
    """
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        medoids = starting_medoids
    converged = False
    labels = np.zeros(len(X))
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()
        S = compute_d_p(X, medoids, p)
        labels = assign_labels(S)
        medoids = update_medoids(X, medoids, p)
        # Converged when the update sweep changed nothing.
        converged = np.array_equal(old_medoids, medoids)
        i += 1
    return (medoids, labels)
# Run the full k-medoids algorithm: 3 clusters, p=2 (squared Euclidean).
results = kmedoids(datapoints, 3, 2)
final_medoids = results[0]
# NOTE(review): `data` is not defined in the extracted text — presumably the
# DataFrame the points were derived from; confirm against the source notebook.
data['clusters'] = results[1]
#Count
def mark_matches(a, b, exact=False):
    """Boolean mask of positions where the label vectors a and b agree.

    Cluster ids are arbitrary, so with exact=False the comparison tolerates
    a global label swap: when fewer than half the entries match exactly,
    the labelings are assumed permuted and the mask is inverted.

    Reconstruction notes: the `def` lines and branch logic were lost in
    extraction; the unused `all_axes = tuple(range(len(a.shape)))` local
    from the fragment has been dropped.
    """
    a_int = a.astype(dtype=int)
    b_int = b.astype(dtype=int)
    exact_matches = a_int == b_int
    if exact:
        return exact_matches
    num_exact_matches = np.sum(exact_matches)
    if (2 * num_exact_matches) >= a_int.size:
        return exact_matches
    # Fewer than half matched: treat the labelings as flipped.
    return ~exact_matches


def count_matches(a, b, exact=False):
    """Number of positions where a and b agree (see mark_matches)."""
    matches = mark_matches(a, b, exact=exact)
    return np.sum(matches)
# NOTE(review): this statement was truncated during extraction (and
# `n_matches` is never assigned in the visible text) — the original
# presumably printed n_matches against the total number of points; restore
# the full call from the source notebook.
print(n_matches,
OUTPUT: