0% found this document useful (0 votes)
13 views · 6 pages

Untitled Document

The document outlines a machine learning workflow using the Breast Cancer dataset, which includes feature selection through a neural network, training a Random Forest classifier, and optimizing its hyperparameters using a genetic algorithm. It evaluates the performance of both a normal and pruned Random Forest model, providing accuracy scores, confusion matrices, and classification reports. The process demonstrates the effectiveness of feature selection and optimization in improving model performance.

Uploaded by

aakashswastikun
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views · 6 pages

Untitled Document

The document outlines a machine learning workflow using the Breast Cancer dataset, which includes feature selection through a neural network, training a Random Forest classifier, and optimizing its hyperparameters using a genetic algorithm. It evaluates the performance of both a normal and pruned Random Forest model, providing accuracy scores, confusion matrices, and classification reports. The process demonstrates the effectiveness of feature selection and optimization in improving model performance.

Uploaded by

aakashswastikun
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 6

# Third-party dependencies for the feature-selection / optimisation script.
# Module paths were reconstructed from the names each import provides.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.optimize import differential_evolution
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from tensorflow import keras

# Load Breast Cancer dataset: X is the feature matrix, y the class labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Split dataset into training and testing sets (20% held out for testing);
# the fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 1: Feature Selection Using Neural Network


def create_nn(input_shape):
    """Build and compile a small feed-forward network for binary classification.

    Args:
        input_shape: Number of input features per sample.

    Returns:
        A compiled ``keras.Sequential`` model with two ReLU hidden layers
        (16 and 8 units) followed by a single sigmoid output unit.
    """
    model = keras.Sequential([
        keras.layers.Dense(
            16, activation='relu', input_shape=(input_shape,)
        ),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),  # Binary classification
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Train Neural Network
nn_model = create_nn(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Get feature importance from the neural network.
# Use the absolute weights of the first layer as a proxy for feature
# importance: the magnitudes are summed along axis 1 to give one score per
# input feature.
weights = np.abs(nn_model.layers[0].get_weights()[0])
feature_importance = np.sum(weights, axis=1)

# Select top-k features based on importance. argsort is ascending, so the
# last k indices are the k highest-scoring features.
k = 10  # Number of features to select
top_k_features = np.argsort(feature_importance)[-k:]

# Filter dataset to include only top-k features
X_train_pruned = X_train[:, top_k_features]
X_test_pruned = X_test[:, top_k_features]

# Print number of features used
print(f"Number of Features Used: {k}")

# Step 2: Tree Pruning Using Selected Features

# Fit a depth-limited Random Forest on the selected feature columns only.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
).fit(X_train_pruned, y_train)

# Score the forest on the held-out samples (same selected columns).
y_pred = rf_model.predict(X_test_pruned)
print(
    "Random Forest Accuracy (Pruned Features):",
    accuracy_score(y_test, y_pred),
)

# Step 3: Optimize Random Forest Using Genetic Algorithm


def ga_fitness(params):
    """
    Fitness function for Genetic Algorithm.
    Optimizes n_estimators and max_depth of Random Forest.

    Args:
        params: Two-element sequence ``(n_estimators, max_depth)`` as
            proposed by the optimizer; both values are truncated to int.

    Returns:
        The negated accuracy of a Random Forest trained on
        ``X_train_pruned`` / ``y_train`` and scored on
        ``X_test_pruned`` / ``y_test`` (negated so a minimizer maximizes
        accuracy).
    """
    n_estimators, max_depth = params
    n_estimators = int(n_estimators)  # Ensure n_estimators is an integer
    max_depth = int(max_depth)  # Ensure max_depth is an integer

    # Train Random Forest with given hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    model.fit(X_train_pruned, y_train)

    # Evaluate on the held-out split.
    # NOTE(review): this is the test split, not a separate validation set,
    # so hyperparameters are tuned on the same data later used to report
    # accuracy -- consider carving a validation split out of the training
    # data instead.
    y_pred = model.predict(X_test_pruned)
    return -accuracy_score(y_test, y_pred)  # Minimize negative accuracy

# Define bounds for GA (n_estimators, max_depth)
bounds = [
    (10, 200),  # n_estimators range
    (2, 20),    # max_depth range
]

# Run the evolutionary search (SciPy differential evolution).
# - seed=42 makes the search reproducible, matching the random_state=42
#   used for the data split and the forests.
# - polish=False skips the final gradient-based refinement: ga_fitness
#   truncates its inputs to integers, so the objective is piecewise
#   constant and polishing only adds extra model fits.
result = differential_evolution(
    ga_fitness,
    bounds,
    strategy='best1bin',
    maxiter=5,
    popsize=10,
    tol=0.01,
    seed=42,
    polish=False,
)

# Get optimized hyperparameters (truncated the same way ga_fitness does)
optimized_n_estimators = int(result.x[0])
optimized_max_depth = int(result.x[1])

# Refit a Random Forest using the hyperparameters found by the search.
optimized_rf_model = RandomForestClassifier(
    n_estimators=optimized_n_estimators,
    max_depth=optimized_max_depth,
    random_state=42,
).fit(X_train_pruned, y_train)

# Predict on the held-out samples and report accuracy.
y_pred_optimized = optimized_rf_model.predict(X_test_pruned)
print(
    "Optimized Random Forest Accuracy (Pruned Features):",
    accuracy_score(y_test, y_pred_optimized),
)

# Confusion Matrix: rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred_optimized)
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Optimized Random Forest)")
plt.show()

# Classification Report
print(
    "\nClassification Report (Optimized Random Forest):\n",
    classification_report(
        y_test, y_pred_optimized, target_names=data.target_names
    ),
)

# Dependencies for the normal-vs-pruned Random Forest comparison.
# Module paths were reconstructed from the names each import provides.
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

# Load dataset and hold out 20% of the samples for testing; the fixed
# random_state keeps the split identical between runs.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Function to evaluate model performance


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Evaluate the model's performance and generate visualizations.

    Fits ``model`` on the training data, then prints its test accuracy,
    training time and classification report, and shows a confusion-matrix
    heatmap and a bar chart of ``model.feature_importances_``.

    Class labels for the plot and the report are read from the
    module-level ``data.target_names``.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and, once
            fitted, ``feature_importances_``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Label used in printed output and plot titles.

    Returns:
        None. Results are printed and plotted.
    """
    # Measure training time with a monotonic, high-resolution clock.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Training Time: {training_time:.4f} seconds")

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=data.target_names,
        yticklabels=data.target_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()

    # Classification Report (built first so the f-string stays on one line)
    report = classification_report(
        y_test, y_pred, target_names=data.target_names
    )
    print(f"\n{report}")

    # Feature Importance Plot: one bar per input column of X_train.
    importances = model.feature_importances_
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(importances)), importances)
    plt.xlabel("Feature Index")
    plt.ylabel("Feature Importance")
    plt.title(f"{model_name} Feature Importance")
    plt.show()

# Train and evaluate Normal Random Forest model (all features).
print("Normal Random Forest Model:")
rf_model_normal = RandomForestClassifier(
    n_estimators=100, max_depth=4, random_state=42
)
evaluate_model(
    rf_model_normal, X_train, X_test, y_train, y_test,
    "Normal Random Forest",
)

# Train and evaluate the Pruned Random Forest model, which sees only a
# hard-coded subset of feature columns.
print("\nPruned Random Forest Model:")
pruned_features = [7, 20, 22, 23, 27]  # Significant features from pruning
X_train_pruned, X_test_pruned = (
    split[:, pruned_features] for split in (X_train, X_test)
)
rf_model_pruned = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
)
evaluate_model(
    rf_model_pruned,
    X_train_pruned,
    X_test_pruned,
    y_train,
    y_test,
    "Pruned Random Forest",
)

# Report how many input columns each model was trained on.
print("\nNumber of Features:")
print(f"Normal Random Forest: {X_train.shape[1]}")
print(f"Pruned Random Forest: {X_train_pruned.shape[1]}")

You might also like