TranMinhTu KTGK

import pandas as pd
data = pd.read_excel('D5 - glass.data.xlsx', sheet_name=1)

data.to_csv('glass.data.csv', index=False)
print(data.head())
Id RI: refractive index \

0 1 1.52101
1 2 1.51761
2 3 1.51618
3 4 1.51766
4 5 1.51742
Na: Sodium (unit measurement: weight percent in corresponding

oxide, as \
0 13.64
1 13.89
2 13.53
3 13.21
4 13.27
Mg: Magnesium Al: Aluminum Si: Silicon K: Potassium Ca: Calcium

\
0 4.49 1.10 71.78 0.06 8.75
1 3.60 1.36 72.73 0.48 7.83
2 3.55 1.54 72.99 0.39 7.78
3 3.69 1.29 72.61 0.57 8.22
4 3.62 1.24 73.08 0.55 8.07
Ba: Barium Fe: Iron Type of glass: (class attribute)

0 0.0 0.0 1
1 0.0 0.0 1
2 0.0 0.0 1
3 0.0 0.0 1
4 0.0 0.0 1
# Xóa cột 'id'

data.drop(columns=['Id'], inplace=True)
# Hiển thị DataFrame sau khi xóa cột

data
{"summary":"{\n \"name\": \"data\",\n \"rows\": 214,\n \"fields\":
[\n {\n \"column\": \" RI: refractive index\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0.0030368637393854334,\n \"min\": 1.51115,\n \"max\":
1.53393,\n \"num_unique_values\": 178,\n \"samples\": [\
n 1.51966,\n 1.51808,\n 1.51969\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"Na:
Sodium (unit measurement: weight percent in corresponding oxide, as
\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0.816603555714983,\n \"min\": 10.73,\n \"max\":
17.38,\n \"num_unique_values\": 142,\n \"samples\": [\n
14.38,\n 13.02,\n 13.83\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Mg: Magnesium\",\n
1.442407844870442,\n \"min\": 0.0,\n \"max\": 4.49,\n
\"num_unique_values\": 94,\n \"samples\": [\n 3.86,\n
3.47,\n 2.9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"Al:
Aluminum\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0.4992696456004845,\n \"min\": 0.29,\n \"max\":
1.48,\n 1.65,\n 1.24\n ],\n
n },\n {\n \"column\": \"Si: Silicon\",\n
0.7745457947651124,\n \"min\": 69.81,\n \"max\": 75.41,\
n \"num_unique_values\": 133,\n \"samples\": [\n
72.22,\n 73.06,\n 71.81\n ],\n
n },\n {\n \"column\": \"K: Potassium\",\n
0.65219184555898,\n \"min\": 0.0,\n \"max\": 6.21,\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"Ca:
Calcium\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 1.423153487281395,\n \"min\": 5.43,\n \"max\":
12.5,\n 8.52,\n 11.52\n ],\n
n },\n {\n \"column\": \"Ba: Barium\",\n
0.49721926059970356,\n \"min\": 0.0,\n \"max\": 3.15,\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"Fe:
Iron\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0.09743870063650084,\n \"min\": 0.0,\n \"max\":
0.51,\n 0.03,\n 0.21\n ],\n
n },\n {\n \"column\": \"Type of glass: (class
attribute)\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 2,\n \"min\": 1,\n
\"max\": 7,\n \"num_unique_values\": 6,\n \"samples\":
[\n 1,\n 2,\n 7\n ],\n
n }\n ]\n}","type":"dataframe","variable_name":"data"}
print(f'Kích thước của dữ liệu: {data.shape}')

print(f'Chiều của dữ liệu: {data.ndim}')
Kích thước của dữ liệu: (214, 10)

Chiều của dữ liệu: 2
print('\nKiểu dữ liệu của các thuộc tính:')

print(data.dtypes)
Kiểu dữ liệu của các thuộc tính:

RI: refractive index
float64
Na: Sodium (unit measurement: weight percent in corresponding oxide,
as float64
Mg: Magnesium
float64
Al: Aluminum
float64
Si: Silicon
float64
K: Potassium
float64
Ca: Calcium
float64
Ba: Barium
float64
Fe: Iron
float64
Type of glass: (class attribute)
int64
dtype: object
data.rename(columns={'Type of glass: (class attribute)': 'label'},

inplace=True)
if 'label' in data.columns:
print('\nSốlượng thực thểcủa các giá trị nhãn:')
print(data['label'].value_counts())
Sốlượng thực thểcủa các giá trị nhãn:
label
2 76
1 70
7 29
3 17
5 13
6 9
Name: count, dtype: int64
print('\nThông sốcủa các cột thuộc tính có giá trị sốthực:')

numeric_columns = data.select_dtypes(include=['float64',
'int64']).columns
for column in numeric_columns:
print(f'\nCột: {column}')
print(f'Giá trị nhỏ nhất: {data[column].min()}')
print(f'Giá trị lớn nhất: {data[column].max()}')
print(f'Giá trị trung bình: {data[column].mean()}')
Thông sốcủa các cột thuộc tính có giá trị sốthực:
Cột: RI: refractive index

Giá trị nhỏ nhất: 1.51115
Giá trị lớn nhất: 1.53393
Giá trị trung bình: 1.5183654205607477
Cột: Na: Sodium (unit measurement: weight percent in corresponding

oxide, as
Cột: Mg: Magnesium

Cột: Al: Aluminum

Cột: Si: Silicon

Cột: K: Potassium
Cột: Ca: Calcium

Cột: Ba: Barium

Giá trị nhỏ nhấ t: 0.0
Giá trị lớn nhấ t: 3.15
Cột: Fe: Iron

Cột: label
Giá trị nhỏ nhất: 1
Giá trị lớn nhất: 7
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
# Lấy các cột thuộc tính là giá trị liên tục

continuous_cols = data.select_dtypes(include=['float64',
'int64']).columns
continuous_data = data[continuous_cols].drop(columns='label')
# Thu giảm số chiều với PCA

pca = PCA(n_components=2)
reduced_data = pca.fit_transform(continuous_data)
# Hiển thị dữ liệu trên không gian 2D

plt.figure(figsize=(10, 6))
scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1],
c=data['label'], cmap='viridis')
plt.legend(handles=scatter.legend_elements()[0],
labels=set(data['label']))
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA Reduction of Continuous Attributes')
plt.show()
from sklearn.model_selection import train_test_split, cross_val_score,
GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,
AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
# Tách dữ liệu thành thuộc tính và nhãn

X = continuous_data
y = data['label']
# Chuẩn hóa dữ liệu

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Các mô hình để đánh giá

models = {
'KNN': KNeighborsClassifier(),
'Random Forest': RandomForestClassifier(),
'Naive Bayes': GaussianNB(),
'AdaBoost': AdaBoostClassifier(),
'SVM': SVC()
}
# Tinh chỉnh tham số và đánh giá các mô hình

best_estimators = {}
f1_scorer = make_scorer(f1_score, average='weighted')
for model_name, model in models.items():

if model_name == 'KNN':
param_grid = {'n_neighbors': [3, 5, 7, 9]}
elif model_name == 'Random Forest':
param_grid = {'n_estimators': [50, 100, 150], 'max_depth':
[None, 10, 20, 30]}
elif model_name == 'AdaBoost':
param_grid = {'n_estimators': [50, 100, 150]}
elif model_name == 'SVM':
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
else:
param_grid = {}
grid_search = GridSearchCV(model, param_grid, cv=10,

scoring=f1_scorer)
grid_search.fit(X_scaled, y)
best_estimators[model_name] = grid_search.best_estimator_
# Đánh giá và so sánh

results = {}
for model_name, estimator in best_estimators.items():

scores = cross_val_score(estimator, X_scaled, y, cv=10,
scoring=f1_scorer)
results[model_name] = scores.mean()
# Hiển thị bảng so sánh hiệu năng

print("\nBảng so sánh hiệu năng các mô hình:")
for model_name, score in results.items():
print(f"{model_name}: F-Score = {score:.4f}")
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_split.py:700: UserWarning: The least populated class in y has only 9
members, which is less than n_splits=10.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split
.py:700: UserWarning: The least populated class in y has only 9
warnings.warn(
warnings.warn(
warnings.warn(
warnings.warn(
warnings.warn(
warnings.warn(
warnings.warn(
warnings.warn(
Bảng so sánh hiệu năng các mô hình:

KNN: F-Score = 0.6532
Random Forest: F-Score = 0.6986
Naive Bayes: F-Score = 0.3703
AdaBoost: F-Score = 0.3789
SVM: F-Score = 0.6071
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_split.py:700: UserWarning: The least populated class in y has only 9
warnings.warn(
from sklearn.cluster import KMeans, DBSCAN

from sklearn.metrics import adjusted_rand_score
# K-Means clustering
kmeans = KMeans(n_clusters=len(data['label'].unique()))
kmeans_labels = kmeans.fit_predict(X_scaled)
print("\nAdjusted Rand Index for K-Means:", adjusted_rand_score(y,
kmeans_labels))
# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
print("Adjusted Rand Index for DBSCAN:", adjusted_rand_score(y,
dbscan_labels))
Adjusted Rand Index for K-Means: 0.14259389944260903

Adjusted Rand Index for DBSCAN: -0.028430115188775154
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/
_kmeans.py:870: FutureWarning: The default value of `n_init` will
change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly
to suppress the warning
warnings.warn(

TranMinhTu KTGK

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

TranMinhTu KTGK

Uploaded by

Copyright:

Available Formats

import pandas as pd

data = pd.read_excel('D5 - glass.data.xlsx', sheet_name=1)

Id RI: refractive index \

Na: Sodium (unit measurement: weight percent in corresponding

Mg: Magnesium Al: Aluminum Si: Silicon K: Potassium Ca: Calcium

1 3.60 1.36 72.73 0.48 7.83

2 3.55 1.54 72.99 0.39 7.78

3 3.69 1.29 72.61 0.57 8.22

4 3.62 1.24 73.08 0.55 8.07

Ba: Barium Fe: Iron Type of glass: (class attribute)

# Xóa cột 'id'

# Hiển thị DataFrame sau khi xóa cột

print(f'Kích thước của dữ liệu: {data.shape}')

Kích thước của dữ liệu: (214, 10)

print('\nKiểu dữ liệu của các thuộc tính:')

Kiểu dữ liệu của các thuộc tính:

data.rename(columns={'Type of glass: (class attribute)': 'label'},

print('\nThông sốcủa các cột thuộc tính có giá trị sốthực:')

Thông sốcủa các cột thuộc tính có giá trị sốthực:

Cột: RI: refractive index

Cột: Na: Sodium (unit measurement: weight percent in corresponding

Cột: Mg: Magnesium

Cột: Al: Aluminum

Cột: Si: Silicon

Cột: Ca: Calcium

Cột: Ba: Barium

Cột: Fe: Iron

from sklearn.decomposition import PCA

# Lấy các cột thuộc tính là giá trị liên tục

# Thu giảm số chiều với PCA

# Hiển thị dữ liệu trên không gian 2D

# Tách dữ liệu thành thuộc tính và nhãn

# Chuẩn hóa dữ liệu

# Các mô hình để đánh giá

# Tinh chỉnh tham số và đánh giá các mô hình

for model_name, model in models.items():

grid_search = GridSearchCV(model, param_grid, cv=10,

# Đánh giá và so sánh

for model_name, estimator in best_estimators.items():

# Hiển thị bảng so sánh hiệu năng

Bảng so sánh hiệu năng các mô hình:

from sklearn.cluster import KMeans, DBSCAN

Adjusted Rand Index for K-Means: 0.14259389944260903

You might also like