You are on page 1 of 16

{

"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wine Price Prediction"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Standard library\n",
"from math import sqrt\n",
"\n",
"# Third-party\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"# Metrics used to evaluate the model\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error\n",
"from sklearn.model_selection import train_test_split\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"wine = pd.read_csv('wine.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Year</th>\n",
" <th>Price</th>\n",
" <th>WinterRain</th>\n",
" <th>AGST</th>\n",
" <th>HarvestRain</th>\n",
" <th>Age</th>\n",
" <th>FrancePop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1952</td>\n",
" <td>7.4950</td>\n",
" <td>600</td>\n",
" <td>17.1167</td>\n",
" <td>160</td>\n",
" <td>31</td>\n",
" <td>43183.569</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1953</td>\n",
" <td>8.0393</td>\n",
" <td>690</td>\n",
" <td>16.7333</td>\n",
" <td>80</td>\n",
" <td>30</td>\n",
" <td>43495.030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1955</td>\n",
" <td>7.6858</td>\n",
" <td>502</td>\n",
" <td>17.1500</td>\n",
" <td>130</td>\n",
" <td>28</td>\n",
" <td>44217.857</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1957</td>\n",
" <td>6.9845</td>\n",
" <td>420</td>\n",
" <td>16.1333</td>\n",
" <td>110</td>\n",
" <td>26</td>\n",
" <td>45152.252</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1958</td>\n",
" <td>6.7772</td>\n",
" <td>582</td>\n",
" <td>16.4167</td>\n",
" <td>187</td>\n",
" <td>25</td>\n",
" <td>45653.805</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Year Price WinterRain AGST HarvestRain Age FrancePop\n",
"0 1952 7.4950 600 17.1167 160 31 43183.569\n",
"1 1953 8.0393 690 16.7333 80 30 43495.030\n",
"2 1955 7.6858 502 17.1500 130 28 44217.857\n",
"3 1957 6.9845 420 16.1333 110 26 45152.252\n",
"4 1958 6.7772 582 16.4167 187 25 45653.805"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# AGST : Average Growing Season Temperature (in Celsius degrees)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(25, 7)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Year', 'Price', 'WinterRain', 'AGST', 'HarvestRain', 'Age',\n",
" 'FrancePop'],\n",
" dtype='object')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.columns"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 25 entries, 0 to 24\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Year 25 non-null int64 \n",
" 1 Price 25 non-null float64\n",
" 2 WinterRain 25 non-null int64 \n",
" 3 AGST 25 non-null float64\n",
" 4 HarvestRain 25 non-null int64 \n",
" 5 Age 25 non-null int64 \n",
" 6 FrancePop 25 non-null float64\n",
"dtypes: float64(3), int64(4)\n",
"memory usage: 1.5 KB\n"
]
}
],
"source": [
"wine.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Year 0\n",
"Price 0\n",
"WinterRain 0\n",
"AGST 0\n",
"HarvestRain 0\n",
"Age 0\n",
"FrancePop 0\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Year</th>\n",
" <th>Price</th>\n",
" <th>WinterRain</th>\n",
" <th>AGST</th>\n",
" <th>HarvestRain</th>\n",
" <th>Age</th>\n",
" <th>FrancePop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Year</th>\n",
" <td>1.000000</td>\n",
" <td>-0.447768</td>\n",
" <td>0.016970</td>\n",
" <td>-0.246916</td>\n",
" <td>0.028009</td>\n",
" <td>-1.000000</td>\n",
" <td>0.994485</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Price</th>\n",
" <td>-0.447768</td>\n",
" <td>1.000000</td>\n",
" <td>0.136651</td>\n",
" <td>0.659563</td>\n",
" <td>-0.563322</td>\n",
" <td>0.447768</td>\n",
" <td>-0.466862</td>\n",
" </tr>\n",
" <tr>\n",
" <th>WinterRain</th>\n",
" <td>0.016970</td>\n",
" <td>0.136651</td>\n",
" <td>1.000000</td>\n",
" <td>-0.321091</td>\n",
" <td>-0.275441</td>\n",
" <td>-0.016970</td>\n",
" <td>-0.001622</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AGST</th>\n",
" <td>-0.246916</td>\n",
" <td>0.659563</td>\n",
" <td>-0.321091</td>\n",
" <td>1.000000</td>\n",
" <td>-0.064496</td>\n",
" <td>0.246916</td>\n",
" <td>-0.259162</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HarvestRain</th>\n",
" <td>0.028009</td>\n",
" <td>-0.563322</td>\n",
" <td>-0.275441</td>\n",
" <td>-0.064496</td>\n",
" <td>1.000000</td>\n",
" <td>-0.028009</td>\n",
" <td>0.041264</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Age</th>\n",
" <td>-1.000000</td>\n",
" <td>0.447768</td>\n",
" <td>-0.016970</td>\n",
" <td>0.246916</td>\n",
" <td>-0.028009</td>\n",
" <td>1.000000</td>\n",
" <td>-0.994485</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FrancePop</th>\n",
" <td>0.994485</td>\n",
" <td>-0.466862</td>\n",
" <td>-0.001622</td>\n",
" <td>-0.259162</td>\n",
" <td>0.041264</td>\n",
" <td>-0.994485</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Year Price WinterRain AGST HarvestRain Age \\\n",
"Year 1.000000 -0.447768 0.016970 -0.246916 0.028009 -1.000000 \n",
"Price -0.447768 1.000000 0.136651 0.659563 -0.563322 0.447768 \n",
"WinterRain 0.016970 0.136651 1.000000 -0.321091 -0.275441 -0.016970 \n",
"AGST -0.246916 0.659563 -0.321091 1.000000 -0.064496 0.246916 \n",
"HarvestRain 0.028009 -0.563322 -0.275441 -0.064496 1.000000 -0.028009 \n",
"Age -1.000000 0.447768 -0.016970 0.246916 -0.028009 1.000000 \n",
"FrancePop 0.994485 -0.466862 -0.001622 -0.259162 0.041264 -0.994485 \n",
"\n",
" FrancePop \n",
"Year 0.994485 \n",
"Price -0.466862 \n",
"WinterRain -0.001622 \n",
"AGST -0.259162 \n",
"HarvestRain 0.041264 \n",
"Age -0.994485 \n",
"FrancePop 1.000000 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.corr()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform EDA"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Year', 'Price', 'WinterRain', 'AGST', 'HarvestRain', 'Age',\n",
" 'FrancePop'],\n",
" dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wine.columns"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Weather measurements are the predictors; Price is the target.\n",
"feature_cols = ['WinterRain', 'AGST', 'HarvestRain']\n",
"target_cols = ['Price']  # selecting with a list keeps y a one-column DataFrame\n",
"\n",
"X = wine.loc[:, feature_cols]\n",
"y = wine.loc[:, target_cols]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>WinterRain</th>\n",
" <th>AGST</th>\n",
" <th>HarvestRain</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>600</td>\n",
" <td>17.1167</td>\n",
" <td>160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>690</td>\n",
" <td>16.7333</td>\n",
" <td>80</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" WinterRain AGST HarvestRain\n",
"0 600 17.1167 160\n",
"1 690 16.7333 80"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7.4950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0393</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Price\n",
"0 7.4950\n",
"1 8.0393"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" WinterRain AGST HarvestRain\n",
"8 697 16.3000 52\n",
"7 830 17.3333 38\n",
"11 602 15.3667 267\n",
"4 582 16.4167 187\n",
"3 420 16.1333 110\n"
]
}
],
"source": [
"# Keep 80% of the rows for training and hold out the rest for testing.\n",
"# The fixed random_state makes the shuffle, and therefore the split, reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123)\n",
"\n",
"print(X_train.head())"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape of train sets\n",
"(20, 3)\n",
"(20, 1)\n",
"************************\n",
"Shape of test sets\n",
"(5, 3)\n",
"(5, 1)\n"
]
}
],
"source": [
"# Confirm the split sizes: X and y must have the same number of rows in each subset.\n",
"print(\"Shape of train sets\")\n",
"for subset in (X_train, y_train):\n",
"    print(subset.shape)\n",
"\n",
"print(\"************************\")\n",
"print(\"Shape of test sets\")\n",
"\n",
"for subset in (X_test, y_test):\n",
"    print(subset.shape)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the linear model on the training split only, so the test rows stay unseen.\n",
"lm = LinearRegression()\n",
"lm.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.00142976 0.6924606 -0.00377696]]\n"
]
}
],
"source": [
"print(lm.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-4.69864537]\n"
]
}
],
"source": [
"print(lm.intercept_)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# prediction on test data\n",
"predict_test = lm.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[7.39499156],\n",
" [7.21052235],\n",
" [7.18801372],\n",
" [5.84628932],\n",
" [6.66210952]])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_test"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>8.0757</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>7.2920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>7.1211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>6.2049</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>6.3459</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Price\n",
"5 8.0757\n",
"21 7.2920\n",
"22 7.1211\n",
"18 6.2049\n",
"15 6.3459"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation metrics for train data\n",
"\n",
"\n",
"Mean square error for train data (MSE) : 0.092\n",
"Root Mean square error for train data (RMSE) : 0.303\n",
" Mean absolute error for train data (MAE) : 0.239\n",
" Mean absolute percentage error for train data (MAPE) : 0.035\n",
"R2 value for train data : 0.764\n"
]
}
],
"source": [
"print(\"Evaluation metrics for train data\")\n",
"\n",
"# In-sample predictions: the model is scored on the rows it was fitted on,\n",
"# so these numbers describe fit quality rather than generalisation.\n",
"predict_train = lm.predict(X_train)\n",
"\n",
"print('\\n')\n",
"# sklearn metrics take (y_true, y_pred): actual values first, predictions second.\n",
"mse_train = mean_squared_error(y_train, predict_train)\n",
"print(\"Mean square error for train data (MSE) :\", round(mse_train,3))\n",
"\n",
"rmse_train = sqrt(mse_train)\n",
"print(\"Root Mean square error for train data (RMSE) :\", round(rmse_train,3))\n",
"\n",
"mae_train = mean_absolute_error(y_train, predict_train)\n",
"print(\" Mean absolute error for train data (MAE) :\", round(mae_train,3))\n",
"\n",
"# sklearn reports MAPE as a fraction, not a percentage (0.035 means 3.5%).\n",
"mape_train = mean_absolute_percentage_error(y_train, predict_train)\n",
"print(\" Mean absolute percentage error for train data (MAPE) :\", round(mape_train,3))\n",
"\n",
"r_square = r2_score(y_train, predict_train)\n",
"print(\"R2 value for train data :\", round(r_square, 3))"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation metrics for test data\n",
"\n",
"\n",
"Mean square error for test data (MSE) : 0.141\n",
"Root Mean square error for test data (RMSE) : 0.375\n",
" Mean absolute error for test data (MAE) : 0.301\n",
" Mean absolute percentage error for test data (MAPE) : 0.042\n",
"R2 value for test data : 0.697\n"
]
}
],
"source": [
"print(\"Evaluation metrics for test data\")\n",
"\n",
"# Out-of-sample predictions on the held-out test split.\n",
"# The test split has only 5 rows, so these metrics are a rough estimate.\n",
"predict_test = lm.predict(X_test)\n",
"\n",
"print('\\n')\n",
"# sklearn metrics take (y_true, y_pred): actual values first, predictions second.\n",
"mse_test = mean_squared_error(y_test, predict_test)\n",
"print(\"Mean square error for test data (MSE) :\", round(mse_test,3))\n",
"\n",
"rmse_test = sqrt(mse_test)\n",
"print(\"Root Mean square error for test data (RMSE) :\", round(rmse_test,3))\n",
"\n",
"mae_test = mean_absolute_error(y_test, predict_test)\n",
"print(\" Mean absolute error for test data (MAE) :\", round(mae_test,3))\n",
"\n",
"# sklearn reports MAPE as a fraction, not a percentage (0.042 means 4.2%).\n",
"mape_test = mean_absolute_percentage_error(y_test, predict_test)\n",
"print(\" Mean absolute percentage error for test data (MAPE) :\", round(mape_test,3))\n",
"\n",
"# Label fixed: this is the R2 of the test split, not the train split.\n",
"r_square_test = r2_score(y_test, predict_test)\n",
"print(\"R2 value for test data :\", round(r_square_test, 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

You might also like