You are on page 1 of 2

# Hierarchical clustering workflow /////////////////////////////////////////////

# Remove the 'Var_1' column from the frame in place.
df.drop(columns=['Var_1'], inplace=True)

#IMPORT LIBRARY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Count the missing (NaN) entries in every column.
df.isnull().sum()

# Show how often each distinct value occurs in the 'Pengalaman Kerja' column.
df.loc[:, 'Pengalaman Kerja'].value_counts()

# Replace missing values with a fixed value per column.
# NOTE(review): the constants are presumably each column's most frequent
# value (see the value_counts inspection) -- confirm against the data.
fill_values = {
    'Pernah_Menikah': "Ya",
    'Lulusan Pendidikan': "Ya",
    'Pekerjaan': "Artis",
    'Pengalaman Kerja': 1.0,
    'Jumlah Keluarga': 2.0,
}
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

# Convert character-valued columns into numeric codes.
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
encoded_columns = (
    "Usia",
    "Pekerjaan",
    "Jenis Kelamin",
    "Pernah_Menikah",
    "Lulusan Pendidikan",
    "Besar Pengeluaran",
)
# The same encoder object is refitted on each column in turn, so once the
# loop finishes it only holds the fit for the last column.
for column_name in encoded_columns:
    df[column_name] = ord_enc.fit_transform(df[[column_name]])
df.head(5)

#
# Standardize every column, then rebuild the frame with the same column names.
# The rebuilt frame is created from a bare array, so it gets a fresh default index.
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
df = pd.DataFrame(data=scaled_data, columns=df.columns)
df

# Chart: dendrogram of the Ward-linkage hierarchy, truncated by level (p=5).
plt.figure(figsize=(15, 10))
ward_linkage = linkage(df, method="ward")
dendrogram(
    ward_linkage,
    truncate_mode='level',
    p=5,
    color_threshold=20,
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.show()

# Find the best number of clusters using the silhouette score.
from sklearn.metrics import silhouette_score

# Candidate cluster counts 2..9, shared by the loop and the bar chart below.
cluster_counts = range(2, 10)
silhouette_scores = []
for n_cluster in cluster_counts:
    # Fit a fresh agglomerative model for this cluster count and score its labels.
    cluster_labels = AgglomerativeClustering(n_clusters=n_cluster).fit_predict(df)
    score = silhouette_score(df, cluster_labels)
    silhouette_scores.append(score)
    print("silhouette_score for n_cluster = ", n_cluster, " is ", score)

# Bar chart of the score for each candidate cluster count.
plt.bar(cluster_counts, silhouette_scores)
plt.xlabel("number of cluster", fontsize=10)
plt.ylabel("Silhouette score", fontsize=10)
plt.show()

#
# Cluster the rows into 3 groups and append each row's label as a 'cluster' column.
agglo = AgglomerativeClustering(n_clusters=3)
labels = agglo.fit(df).labels_
cluster_frame = pd.DataFrame({'cluster': labels})
df = pd.concat([df, cluster_frame], axis=1)
df.head(5)

#
# For every column, draw one grid of histograms with one panel per cluster.
# Iterating the frame yields its column labels; the 'cluster' column itself
# is part of the iteration too.
for column_name in df:
    grid = sns.FacetGrid(df, col='cluster')
    grid.map(plt.hist, column_name)

#
# PCA decomposition: project the pairwise cosine distances between rows
# onto two components for plotting.
# The 'cluster' column is the clustering output, not an input feature, so it
# is left out of the distance computation; errors='ignore' keeps this working
# when the column is absent.
features = df.drop(columns=['cluster'], errors='ignore')
dist = 1 - cosine_similarity(features)
pca = PCA(n_components=2)
# `pca` is rebound from the estimator to the projected 2-column array, which
# is what the plotting code reads.
pca = pca.fit_transform(dist)

#
# Visualization: scatter the two PCA components, one colour per cluster label.
x, y = pca[:, 0], pca[:, 1]

# Colour and legend text for each cluster label (keys match the 3 fitted clusters).
warna = {
    0: 'red',
    1: 'green',
    2: 'yellow',
}
label_pca = {
    0: 'cluster 1',
    1: 'cluster 2',
    2: 'cluster 3',
}

# NOTE: this rebinds `df` to the plotting frame (x, y, label).
df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(15, 10))
for name, group in groups:
    # linestyle='' draws markers only, giving a scatter plot per cluster.
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=warna[name], label=label_pca[name], mec='none')

# Axis cosmetics do not depend on the group, so they are applied once.
ax.set_aspect('auto')
# Hide ticks and tick labels. Booleans are required here: the string 'off' is
# truthy, and the y axis is controlled by left/right/labelleft rather than
# bottom/top/labelbottom.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False,
               labelleft=False)
ax.legend()
ax.set_title("Visualisasi Agglomerative Clustering")
plt.show()

You might also like