
import numpy as np  # linear algebra
import pandas as pd  # data processing, file I/O (e.g. pd.read_excel)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset (local Excel file) and separate features from the class label
df = pd.read_excel("/Users/ngocanhle/Downloads/DS_ANHHUY.xlsx")
X = np.array(df[["Total Income", "exp", "region"]])  # features
Y = np.array(df["Classification"])                   # target class

validation_size = 0.3
seed = 0
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
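# Optional sketch (not part of the original workflow): KNN relies on distances,
# so features on very different scales (e.g. income vs. a region code) can
# dominate the neighbor search. Standardizing the training features is often
# worth trying; the *_scaled variables below are new names and are not used by
# the rest of the script.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)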

# Search for the best k using 5-fold cross-validation on the training set
k_values = list(range(1, 100))
cv_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, Y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

optimal_k = k_values[cv_scores.index(max(cv_scores))]
print(f"The optimal number of neighbors is {optimal_k}.")
#-----------------
# Re-split the data into training and validation sets, then fit KNN with the
# k selected by the cross-validation search above
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
knn = KNeighborsClassifier(n_neighbors=34)
knn.fit(X_train, Y_train)
# predict the response
Y_predict = knn.predict(X_validation)
# evaluate accuracy
print("Accuracy of KNN (k=34): %.2f %%" % (100 * accuracy_score(Y_validation, Y_predict)))

#-------------------
# Predict the class for a new observation: [Total Income, exp, region]
Xnew = [[55, 11, 0]]
# make a prediction
ynew = knn.predict(Xnew)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))
# OLS regression of "noodle (box)" on the same three features
import statsmodels.api as sm
X = sm.add_constant(df[["Total Income", "exp", "region"]])
y = df["noodle (box)"]
model = sm.OLS(y, X).fit()
print(model.summary())
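# Optional sketch: the fitted OLS model can predict noodle consumption for the
# same hypothetical observation used above (income 55, exp 11, region 0);
# the leading 1.0 matches the constant added by sm.add_constant.
x_new = np.array([[1.0, 55, 11, 0]])
print("Predicted noodle (box):", model.predict(x_new)[0])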
