You are on page 1 of 12

KLASIFIKASI DATA

1. Logistic Regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the logistic-regression classifier on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = model.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Visualize the training data together with the model's decision regions.
plt.figure(figsize=(10, 6))

# Scatter plot of the training points, coloured by their true label.
sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Label',
                data=pd.concat([X_train, y_train], axis=1))
plt.title('Data Training')

# Reuse the axis limits chosen by the scatter plot so the grid covers
# exactly the visible area.
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# Evaluate the model on a 50 x 50 grid spanning that area.
xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
                     np.linspace(ylim[0], ylim[1], 50))
# Keep the training column names so predict() receives the same feature
# names the model was fitted with.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = model.predict(grid)

# predict() returns string labels, which contourf cannot draw. Replace each
# label by its position in model.classes_ to get a numeric surface.
Z = pd.Categorical(Z, categories=model.classes_).codes.reshape(xx.shape)
# One filled band per class: boundaries sit halfway between the integer codes.
levels = np.arange(len(model.classes_) + 1) - 0.5
# zorder=0 keeps the filled regions behind the scatter points.
plt.contourf(xx, yy, Z, levels=levels, alpha=0.5, cmap='viridis', zorder=0)

plt.show()

2. K-Means

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# train_test_split is used below but is missing from this section's imports.
from sklearn.model_selection import train_test_split

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Standardize all three columns before clustering.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Group the rows into 3 clusters. n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Split into training and test sets. 'Cluster' is the target, so it must not
# also appear among the features (that would leak the answer to the model).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Visualize the clustering result in the original feature units.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Cluster', data=df, palette='viridis')
plt.title('Hasil K-Means Clustering')

# K-Means was fitted on the standardized data, so cluster_centers_ are in
# standardized units. Map them back to the original units before drawing
# them on the same axes as the raw data.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
# Columns 0 and 1 are Jumlah_Pengunjung and Durasi_Pengunjung.
plt.scatter(centers[:, 0], centers[:, 1], marker='X', s=200, c='red', label='Centroids')
plt.legend()
plt.show()

# LogisticRegression is used below but is missing from this section's imports.
from sklearn.linear_model import LogisticRegression

# Fit a classifier that predicts the K-Means cluster id from X_train.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the cluster id of the held-out test rows.
y_pred = model.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

3. K-Nearest Neighbors (K-NN)

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 3-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = knn.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# numpy builds the mesh grid below but is missing from this section's imports.
import numpy as np

# Visualize the training data together with the K-NN decision regions.
plt.figure(figsize=(10, 6))

# Scatter plot of the training points, coloured by their true label.
sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Label',
                data=pd.concat([X_train, y_train], axis=1))
plt.title('Data Training')

# Grid covering the training data with a one-unit margin on every side.
# A fixed number of points per axis is used instead of a fixed step because
# the two features live on very different scales (hundreds vs. single digits).
x_min, x_max = X_train.iloc[:, 0].min() - 1, X_train.iloc[:, 0].max() + 1
y_min, y_max = X_train.iloc[:, 1].min() - 1, X_train.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
# Keep the training column names so predict() receives the same feature
# names the model was fitted with.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = knn.predict(grid)

# predict() returns string labels, which contourf cannot draw. Replace each
# label by its position in knn.classes_ to get a numeric surface.
Z = pd.Categorical(Z, categories=knn.classes_).codes.reshape(xx.shape)
# One filled band per class: boundaries sit halfway between the integer codes.
levels = np.arange(len(knn.classes_) + 1) - 0.5
# zorder=0 keeps the filled regions behind the scatter points.
plt.contourf(xx, yy, Z, levels=levels, alpha=0.3, zorder=0)

plt.show()

4. Support Vector Machines (SVM)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear-kernel support vector classifier on the training split.
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = svm.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# numpy builds the mesh grid below but is missing from this section's imports.
import numpy as np

# Visualize the training data together with the SVM decision regions.
plt.figure(figsize=(10, 6))

# Scatter plot of the training points, coloured by their true label.
sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Label',
                data=pd.concat([X_train, y_train], axis=1))
plt.title('Data Training')

# Grid covering the training data with a one-unit margin on every side.
# A fixed number of points per axis is used instead of a fixed step because
# the two features live on very different scales (hundreds vs. single digits).
x_min, x_max = X_train.iloc[:, 0].min() - 1, X_train.iloc[:, 0].max() + 1
y_min, y_max = X_train.iloc[:, 1].min() - 1, X_train.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
# Keep the training column names so predict() receives the same feature
# names the model was fitted with.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = svm.predict(grid)

# predict() returns string labels, which contourf cannot draw. Replace each
# label by its position in svm.classes_ to get a numeric surface.
Z = pd.Categorical(Z, categories=svm.classes_).codes.reshape(xx.shape)
# One filled band per class: boundaries sit halfway between the integer codes.
levels = np.arange(len(svm.classes_) + 1) - 0.5
# zorder=0 keeps the filled regions behind the scatter points.
plt.contourf(xx, yy, Z, levels=levels, alpha=0.3, zorder=0)
plt.show()

5. Decision Trees

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision-tree classifier on the training split.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = dt_classifier.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Draw the fitted decision tree.
plt.figure(figsize=(12, 8))
plot_tree(
    dt_classifier,
    # Plain lists: recent scikit-learn versions reject other containers here.
    feature_names=list(X.columns),
    # class_names must follow the classifier's own class order
    # (dt_classifier.classes_), not the order in which labels first appear in
    # the data; otherwise the nodes are annotated with the wrong class.
    class_names=[str(label) for label in dt_classifier.classes_],
    filled=True,
    rounded=True,
)
plt.title('Decision Tree')
plt.show()

6. Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = rf_classifier.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print the importance the fitted forest assigns to each feature.
feature_importances = rf_classifier.feature_importances_
print("\nFeature Importances:")
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance}")

# Show the same importances as a horizontal bar chart.
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=X.columns, orient='h')
plt.title('Feature Importances')
plt.show()

7. Naive Bayes

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Gaussian Naive Bayes classifier on the training split.
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = naive_bayes.predict(X_test)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Plot the training split: one point per row, coloured by its true label.
training_frame = X_train.join(y_train)

plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(
    data=training_frame,
    x='Jumlah_Pengunjung',
    y='Durasi_Pengunjung',
    hue='Label',
)
scatter_axes.set_title('Data Training')

plt.show()

8. Neural Networks

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a multi-layer perceptron with two hidden layers of 50 units each.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)
mlp_classifier.fit(X_train_scaled, y_train)

# Predict the labels of the held-out (scaled) test split.
y_pred = mlp_classifier.predict(X_test_scaled)

# Print the confusion matrix and the classification report.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the (unscaled) training split: one point per row, coloured by its
# true label.
training_frame = X_train.join(y_train)

plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(
    data=training_frame,
    x='Jumlah_Pengunjung',
    y='Durasi_Pengunjung',
    hue='Label',
)
scatter_axes.set_title('Data Training')

plt.show()

9. Membandingkan Logistic Regression, K-Means, K-Nearest Neighbors (K-NN), Support Vector Machines (SVM), Decision Trees, Random Forest, Naive Bayes, Neural Networks

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Build a small, hand-made e-commerce sales data set.
data = {
    'Jumlah_Pengunjung': [150, 200, 300, 120, 180, 250, 140, 280, 200, 320],
    'Durasi_Pengunjung': [2, 3, 4, 1, 2, 3, 1.5, 4, 3, 5],
    'Total_Penjualan': [500, 800, 1200, 300, 600, 1000, 400, 1100, 900, 1300],
}

df = pd.DataFrame(data)

# Derive the target label from Total_Penjualan. pd.cut uses right-closed bins,
# so: <= 500 -> 'Tidak Laris', 500 < x <= 1000 -> 'Sedang', > 1000 -> 'Laris'.
df['Label'] = pd.cut(
    df['Total_Penjualan'],
    bins=[-float('inf'), 500, 1000, float('inf')],
    labels=['Tidak Laris', 'Sedang', 'Laris'],
)

# Split into training and test sets (80/20, fixed seed for reproducibility).
X = df[['Jumlah_Pengunjung', 'Durasi_Pengunjung']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def _as_plot_frame(X, y):
    """Return one DataFrame with the two feature columns plus ``Label``."""
    if isinstance(X, pd.DataFrame):
        features = X
    else:
        # StandardScaler output is a bare array: restore the column names and
        # reuse the label index so features and labels line up row by row.
        features = pd.DataFrame(
            X, columns=['Jumlah_Pengunjung', 'Durasi_Pengunjung'], index=y.index
        )
    return pd.concat([features, y], axis=1)


def train_evaluate_visualize(model, X_train, y_train, X_test, y_test, model_name):
    """Fit *model*, print its train/test accuracy and plot both data splits.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features with the two columns ``Jumlah_Pengunjung``
            and ``Durasi_Pengunjung``; a DataFrame or a two-column array.
        y_train: Training labels as a Series named ``Label``.
        X_test: Test features, same layout as ``X_train``.
        y_test: Test labels as a Series named ``Label``.
        model_name: Text used in the printed heading and the plot titles.

    Returns:
        None. Results are printed and shown in a matplotlib figure.
    """
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"### {model_name} ###")

    # Accuracy on the data the model was fitted on.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy: {train_accuracy:.2f}")

    # Accuracy on the held-out data.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Testing Accuracy: {test_accuracy:.2f}")

    # Side-by-side scatter plots of both splits, coloured by true label.
    # When scaled arrays are passed in, the axes show the scaled values.
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Label',
                    data=_as_plot_frame(X_train, y_train))
    plt.title(f'Data Training - {model_name}')

    plt.subplot(1, 2, 2)
    sns.scatterplot(x='Jumlah_Pengunjung', y='Durasi_Pengunjung', hue='Label',
                    data=_as_plot_frame(X_test, y_test))
    plt.title(f'Data Testing - {model_name}')

    plt.show()

# Random Forest is named in this section's heading but is missing from its
# imports; import it here so it can be part of the comparison.
from sklearn.ensemble import RandomForestClassifier

# Train, evaluate and visualize every classifier on the same split.
# K-Means is left out: it is unsupervised and produces cluster ids, not the
# 'Label' classes, so its output cannot be scored against y_test.
train_evaluate_visualize(LogisticRegression(), X_train, y_train, X_test, y_test, 'Logistic Regression')
train_evaluate_visualize(KNeighborsClassifier(n_neighbors=3), X_train, y_train, X_test, y_test,
                         'K-Nearest Neighbors (K-NN)')
train_evaluate_visualize(SVC(kernel='linear', C=1), X_train, y_train, X_test, y_test,
                         'Support Vector Machines (SVM)')
train_evaluate_visualize(DecisionTreeClassifier(random_state=42), X_train, y_train, X_test, y_test,
                         'Decision Trees')
train_evaluate_visualize(RandomForestClassifier(n_estimators=100, random_state=42), X_train, y_train,
                         X_test, y_test, 'Random Forest')
train_evaluate_visualize(GaussianNB(), X_train, y_train, X_test, y_test, 'Naive Bayes')
# The neural network is the only model fed the standardized features.
train_evaluate_visualize(MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42),
                         X_train_scaled, y_train, X_test_scaled, y_test, 'Neural Networks')
Latihan / Tugas

1. Cobalah semua program di atas, lalu amati hasilnya.


2. Buatlah atau carilah data untuk diklasifikasi, gunakan semua metode di atas satu per satu, lalu terakhir
bandingkan semua metode tersebut.

- Logistic Regression
- K-Means
- K-Nearest Neighbors (K-NN)
- Support Vector Machines (SVM)
- Decision Trees
- Random Forest
- Naive Bayes
- Neural Networks

3. Buatlah PPT: pilihlah salah satu di antara metode tersebut, lalu jelaskan penggunaannya, rumusnya, dan
parameternya. Bisa menggunakan bantuan ChatGPT.

You might also like