# You are on page 1 of 2

# data preparation as feature engineering for wine data

from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn import (
    ensemble,
    linear_model,
    neighbors,
    neural_network,
    pipeline,
    preprocessing,
    tree,
)
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# load the dataset
# The CSV has no header row. The last column is used as the class label and
# every preceding column as an input feature.
# Fetched over https so the data cannot be altered in transit.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
df = read_csv(url, header=None)
data = df.values
x, y = data[:, :-1], data[:, -1]
# Features are cast to float; labels are stringified and then integer-encoded.
x = x.astype('float')
y = preprocessing.LabelEncoder().fit_transform(y.astype('str'))

# transform for feature union
# Each entry is a (name, transformer) pair. Uncomment the entries to include;
# the outputs of all active transformers are concatenated column-wise.
transforms = [
    # ('mms', preprocessing.MinMaxScaler()),
    # ('stds', preprocessing.StandardScaler()),
    # ('kbd', preprocessing.KBinsDiscretizer()),
    # ('qt', preprocessing.QuantileTransformer()),
    # ('pnf', preprocessing.PolynomialFeatures()),
    # ('spl', preprocessing.SplineTransformer()),
    # ('mas', preprocessing.MaxAbsScaler()),
    # ('rbs', preprocessing.RobustScaler()),
    # ('tcs', TruncatedSVD()),
]

# A FeatureUnion with no transformers fails when fitted, so fall back to an
# identity transform (FunctionTransformer with no function passes the input
# through unchanged). This gives a raw-feature baseline when nothing above is
# enabled.
if not transforms:
    transforms.append(('raw', preprocessing.FunctionTransformer()))

# create the feature union
fu = pipeline.FeatureUnion(transforms)

# define the AI model (as classification model)
# Exactly one `model = ...` line must be active, because the pipeline step
# below reads `model`. The decision tree (first option) is enabled as the
# default; move the comment marker to try a different classifier.
model = tree.DecisionTreeClassifier()
#model = tree.ExtraTreeClassifier()
#model = linear_model.Perceptron()
#model = neural_network.MLPClassifier(hidden_layer_sizes = 5)
#model = ensemble.ExtraTreesClassifier()
#model = ensemble.AdaBoostClassifier()
#model = ensemble.RandomForestClassifier()
#model = neighbors.KNeighborsClassifier()
#model = neighbors.RadiusNeighborsClassifier()
# define the pipeline: the feature union runs first, then the classifier
steps = [('fu', fu), ('m', model)]
# NOTE: this rebinds the name `pipeline` from the sklearn module to the
# Pipeline instance; the module is not reachable under this name afterwards.
pipeline = pipeline.Pipeline(steps=steps)

# define the cross-validation procedure
# 10 stratified folds, repeated 10 times, with a fixed seed for repeatability
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=10,
    random_state=1,
)

# evaluate the model
# One accuracy score per cross-validation split; n_jobs=-1 runs the splits
# in parallel on all available cores.
scores = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# print results: mean and standard deviation of the per-split accuracies
mean_accuracy = mean(scores)
std_accuracy = std(scores)
print('Accuracy (mean: %0.3f with std %0.3f)' % (mean_accuracy, std_accuracy))

# You might also like