
PROYECTO MACHINE LEARNING

In [ ]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets,metrics
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [ ]: df=pd.read_csv('bd_clientes_2020.csv')

In [ ]: df.head()

Out[3]: anio mes semana cliente ticket categorias productos_unicos segmento
0 2020 1 1 adef_01 2100 18 27 A
1 2020 1 2 adef_01 2100 18 27 A
2 2020 1 3 adef_01 2100 18 27 A
3 2020 1 4 adef_01 2100 18 27 A
4 2020 2 1 adef_01 2100 18 27 A

MILESTONE N°01
1 Identify the numeric variables (means, standard deviations, etc.) and the categorical ones (frequencies, unique values)

2 Identify the categories per month and week, and the average number of unique products (e.g. more unique products are bought in September than in January)

3 The client's consumption pattern within the month.

In [ ]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 8 columns):
anio 2880 non-null int64
mes 2880 non-null int64
semana 2880 non-null int64
cliente 2880 non-null object
ticket 2880 non-null int64
categorias 2880 non-null int64
productos_unicos 2880 non-null int64
segmento 2880 non-null object
dtypes: int64(6), object(2)
memory usage: 180.1+ KB
In [ ]: df.describe()

Out[5]: anio mes semana ticket categorias productos_unicos
count 2880.0 2880.000000 2880.000000 2880.000000 2880.000000 2880.000000
mean 2020.0 6.500000 2.500000 1582.500000 14.000000 22.000000
std 0.0 3.452652 1.118228 678.069609 7.484614 11.960337
min 2020.0 1.000000 1.000000 450.000000 3.000000 6.000000
25% 2020.0 3.750000 1.750000 900.000000 8.250000 14.250000
50% 2020.0 6.500000 2.500000 1650.000000 12.000000 19.500000
75% 2020.0 9.250000 3.250000 2100.000000 18.000000 27.000000
max 2020.0 12.000000 4.000000 2700.000000 27.000000 45.000000

In [ ]: df.columns

Out[6]: Index(['anio', 'mes', 'semana', 'cliente', 'ticket', 'categorias',
        'productos_unicos', 'segmento'],
       dtype='object')

In [ ]: df.isnull().sum()

Out[7]: anio 0
mes 0
semana 0
cliente 0
ticket 0
categorias 0
productos_unicos 0
segmento 0
dtype: int64

In [ ]: # Confirm the variable types
df.dtypes

Out[8]: anio int64
mes int64
semana int64
cliente object
ticket int64
categorias int64
productos_unicos int64
segmento object
dtype: object
In [ ]: sns.countplot(x='segmento', data=df)
plt.show()

In [ ]: sns.regplot(x='ticket', y="categorias", data=df)
plt.show()

In [ ]: df['categorias'].value_counts(dropna = False)

Out[11]: 18 720
12 720
27 480
6 480
9 240
3 240
Name: categorias, dtype: int64

In [ ]: df['segmento'].value_counts(dropna = False)

Out[12]: A 1080
B 900
C 900
Name: segmento, dtype: int64
In [ ]: df['productos_unicos'].value_counts(dropna = False)

Out[13]: 45 480
27 480
21 480
18 480
15 240
9 240
12 240
6 240
Name: productos_unicos, dtype: int64

In [ ]: df.groupby('segmento').agg({'productos_unicos':'count'}).reset_index().head()

Out[14]: segmento productos_unicos
0 A 1080
1 B 900
2 C 900

In [ ]: df.groupby('segmento').agg({'categorias':'count'}).reset_index().head()

Out[15]: segmento categorias
0 A 1080
1 B 900
2 C 900

In [ ]: df.groupby('mes').agg({'productos_unicos':'count'}).reset_index().head(12)

Out[16]: mes productos_unicos
0 1 240
1 2 240
2 3 240
3 4 240
4 5 240
5 6 240
6 7 240
7 8 240
8 9 240
9 10 240
10 11 240
11 12 240
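The count above only says how many rows each month has (240, by construction). Milestone item 2 asks for the monthly average of unique products, so here is a minimal sketch of that comparison:

In [ ]: # Average unique products and categories per month, to check for seasonality
df.groupby('mes').agg({'productos_unicos': 'mean', 'categorias': 'mean'}).round(2)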

In [ ]: df.groupby('segmento').mean()

Out[17]: anio mes semana ticket categorias productos_unicos
segmento
A 2020.0 6.666667 2.5 2233.333333 21.333333 34.0
B 2020.0 6.400000 2.2 1644.000000 13.400000 19.4
C 2020.0 6.400000 2.8 740.000000 5.800000 10.2


In [ ]: df['cliente'].value_counts(dropna = False)

Out[18]: adef_018 48
adef_021 48
adef_06 48
adef_01 48
adef_019 48
adef_014 48
adef_020 48
adef_09 48
adef_025 48
adef_012 48
adef_028 48
adef_023 48
adef_02 48
adef_011 48
adef_015 48
adef_026 48
adef_017 48
adef_07 48
adef_029 48
adef_027 48
adef_030 48
adef_010 48
adef_08 48
adef_013 48
adef_04 48
adef_03 48
adef_024 48
adef_022 48
adef_05 48
adef_016 48
..
adef_083 24
adef_056 24
adef_070 24
adef_080 24
adef_066 24
adef_075 24
adef_065 24
adef_084 24
adef_033 24
adef_052 24
adef_034 24
adef_078 24
adef_046 24
adef_058 24
adef_037 24
adef_044 24
adef_061 24
adef_071 24
adef_086 24
adef_055 24
adef_068 24
adef_079 24
adef_041 24
adef_043 24
adef_038 24
adef_088 24
adef_053 24
adef_040 24
adef_047 24
adef_042 24
Name: cliente, Length: 90, dtype: int64
In [ ]: corr = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, vmax=.8, linewidths=0.01, square=True, annot=True)
plt.show()


MILESTONE N°02
1 Implement a linear regression vs. a decision tree and compare their MAE, MSE and RMSE

2 Fit a logistic regression and report its performance metrics (confusion matrix, ...). The target classes for this logistic regression are A, B and C

Linear Regression
In [ ]: # Identify the numeric variables
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 8 columns):
anio 2880 non-null int64
mes 2880 non-null int64
semana 2880 non-null int64
cliente 2880 non-null object
ticket 2880 non-null int64
categorias 2880 non-null int64
productos_unicos 2880 non-null int64
segmento 2880 non-null object
dtypes: int64(6), object(2)
memory usage: 180.1+ KB

In [ ]: df.columns

Out[21]: Index(['anio', 'mes', 'semana', 'cliente', 'ticket', 'categorias',
        'productos_unicos', 'segmento'],
       dtype='object')

In [ ]: # Split the data into train and test sets

y = df["ticket"]
x = df[["anio", "mes", "semana", "productos_unicos", "categorias"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)

In [ ]: x_t = sm.add_constant(x_t)

In [ ]: model =sm.OLS(y_t, x_t)

In [ ]: results = model.fit()


In [ ]: print(results.summary())

OLS Regression Results


==============================================================================
Dep. Variable: ticket R-squared: 0.959
Model: OLS Adj. R-squared: 0.959
Method: Least Squares F-statistic: 1.345e+04
Date: Sat, 14 May 2022 Prob (F-statistic): 0.00
Time: 10:36:38 Log-Likelihood: -14620.
No. Observations: 2304 AIC: 2.925e+04
Df Residuals: 2299 BIC: 2.928e+04
Df Model: 4
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
anio 0.2676 0.005 53.915 0.000 0.258 0.277
mes -40.8456 0.894 -45.707 0.000 -42.598 -39.093
semana 3.9159 2.608 1.502 0.133 -1.198 9.029
productos_unicos -16.6045 1.118 -14.856 0.000 -18.796 -14.413
categorias 118.7504 1.816 65.389 0.000 115.189 122.312
==============================================================================
Omnibus: 42.019 Durbin-Watson: 2.040
Prob(Omnibus): 0.000 Jarque-Bera (JB): 41.609
Skew: 0.303 Prob(JB): 9.22e-10
Kurtosis: 2.743 Cond. No. 1.86e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.86e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Interpretation

R-squared: 0.959. The independent variables explain 95.9% of the behavior of the dependent variable.

P>|t|: the p-values are below 0.05 for every variable except "semana", which means that, taken together, the independent variables are statistically significant and explain the target variable satisfactorily.

Coef: the regression coefficient. A one-unit increase in "anio" is associated with an increase of 0.2676 in the target variable ("ticket").
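The summary also warns about a large condition number (1.86e+03). A quick way to probe that multicollinearity hint is the variance inflation factor from statsmodels; a minimal sketch (the 5-10 threshold is a common rule of thumb, not part of this notebook):

In [ ]: # VIF per predictor; values above ~5-10 suggest collinearity (the 'const' row can be ignored)
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.Series([variance_inflation_factor(x_t.values, i) for i in range(x_t.shape[1])],
          index=x_t.columns)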

In [ ]: # Prediction
x_p = sm.add_constant(x_p)
y_f = results.predict(x_p)


In [ ]: df_result =pd.DataFrame({"y_f":y_f,
"y_p":y_p})
df_result

Out[84]: y_f y_p
881 1470.335157 1500
1752 1579.988354 1800
803 2647.719416 2700
964 2152.018157 2100
5 2155.934089 2100
1505 1973.557399 1890
1857 1167.813369 900
1742 1037.444597 900
2723 567.821640 450
554 1392.559817 1500
714 2562.112212 2400
930 1474.251088 1500
1859 1126.967733 900
2080 1965.725535 1890
225 2639.887552 2700
2348 662.199316 750
2825 751.722452 750
705 2639.887552 2700
484 2152.018157 2100
2074 1119.135869 900
379 2566.028144 2700
1884 1078.290233 900
345 2115.088453 2100
1703 1851.020491 1890
811 2566.028144 2700
2090 1539.142718 1800
2045 1506.128946 1800
2823 486.130368 450
1531 1932.711763 1890
1174 1433.405453 1500
... ... ...
7 2163.765953 2100
2208 1020.842261 900
1973 1506.128946 1800
1919 1851.020491 1890
537 2115.088453 2100
156 2070.326885 2100
1337 2558.196280 2700
2586 703.044952 750
540 2070.326885 2100
2501 946.982853 900
889 1388.643885 1500
437 2155.934089 2100
712 2554.280348 2400
1367 1437.321384 1500
1796 1884.034263 1890
2624 600.835412 450
326 2602.957848 2400
2735 629.185544 750
1879 1465.283310 1800
502 1433.405453 1500
589 2074.242817 2100
2124 1078.290233 900
763 2566.028144 2700
2696 600.835412 450
74 1392.559817 1500
730 2119.004385 2100
1598 1037.444597 900
1853 1506.128946 1800
1065 2115.088453 2100
1236 2595.125984 2400
576 rows × 2 columns

Performance metrics for numeric targets

In [ ]: # MAE: Mean Absolute Error
sum(abs(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[85]: 104.24033430352067

In [ ]: # MSE: Mean Squared Error
sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[86]: 17533.42715534988

In [ ]: # RMSE: Root Mean Squared Error
np.sqrt(sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p']))

Out[87]: 132.41384804977869
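The three metrics above can also be cross-checked with the sklearn helpers that were already imported at the top of the notebook; a minimal sketch:

In [ ]: # Same metrics via sklearn (mean_absolute_error / mean_squared_error imported above)
print("MAE :", mean_absolute_error(df_result['y_p'], df_result['y_f']))
print("MSE :", mean_squared_error(df_result['y_p'], df_result['y_f']))
print("RMSE:", np.sqrt(mean_squared_error(df_result['y_p'], df_result['y_f'])))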

Decision Tree - Numeric Target


In [ ]: # Load the DecisionTreeRegressor learner

regressor = DecisionTreeRegressor(random_state=9, max_depth=3, min_samples_leaf=5)
regressor.fit(x_t,y_t)

Out[88]: DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
         max_leaf_nodes=None, min_impurity_decrease=0.0,
         min_impurity_split=None, min_samples_leaf=5,
         min_samples_split=2, min_weight_fraction_leaf=0.0,
         presort=False, random_state=9, splitter='best')
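To inspect which splits the tree actually learned, recent scikit-learn releases provide sklearn.tree.export_text; a minimal sketch, assuming a version that includes it (the parameter list printed above suggests this notebook ran on an older release):

In [ ]: # Print the learned splits as text (requires scikit-learn >= 0.21)
from sklearn.tree import export_text
print(export_text(regressor, feature_names=list(x_t.columns)))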

In [ ]: y_f = regressor.predict(x_p)


In [ ]: df_result = pd.DataFrame({"y_f":y_f,
"y_p":y_p})
df_result

Out[90]: y_f y_p
881 1500.000000 1500
1752 1843.809524 1800
803 2549.222798 2700
964 2100.000000 2100
5 2100.000000 2100
1505 1843.809524 1890
1857 900.000000 900
1742 900.000000 900
2723 450.000000 450
554 1500.000000 1500
714 2549.222798 2400
930 1500.000000 1500
1859 900.000000 900
2080 1843.809524 1890
225 2549.222798 2700
2348 750.000000 750
2825 750.000000 750
705 2549.222798 2700
484 2100.000000 2100
2074 900.000000 900
379 2549.222798 2700
1884 900.000000 900
345 2100.000000 2100
1703 1843.809524 1890
811 2549.222798 2700
2090 1843.809524 1800
2045 1843.809524 1800
2823 450.000000 450
1531 1843.809524 1890
1174 1500.000000 1500
... ... ...
7 2100.000000 2100
2208 900.000000 900
1973 1843.809524 1800
1919 1843.809524 1890
537 2100.000000 2100
156 2100.000000 2100
1337 2549.222798 2700
2586 750.000000 750
540 2100.000000 2100
2501 900.000000 900
889 1500.000000 1500
437 2100.000000 2100
712 2549.222798 2400
1367 1500.000000 1500
1796 1843.809524 1890
2624 450.000000 450
326 2549.222798 2400
2735 750.000000 750
1879 1843.809524 1800
502 1500.000000 1500
589 2100.000000 2100
2124 900.000000 900
763 2549.222798 2700
2696 450.000000 450
74 1500.000000 1500
730 2100.000000 2100
1598 900.000000 900
1853 1843.809524 1800
1065 2100.000000 2100
1236 2549.222798 2400
576 rows × 2 columns

In [ ]: # MAE: Mean Absolute Error
sum(abs(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[91]: 32.4712832743921

In [ ]: # MSE: Mean Squared Error
sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[92]: 4033.4879994546386

In [ ]: # RMSE: Root Mean Squared Error
np.sqrt(sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p']))

Out[100]: 63.509747279095976

Comment: the decision tree's performance metrics are lower than the linear regression's, i.e. it fits these data better.

Metric   Linear Regression   Decision Tree
MAE      104.24              32.47
MSE      17533.42            4033.48
RMSE     132.41              63.50

Logistic Regression


In [ ]: y = df["segmento"]
x = df[["anio", "mes", "semana", "ticket", "categorias", "productos_unicos"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)

In [ ]: lr = LogisticRegression(max_iter = 9000)
lr = lr.fit(x_t, y_t)
y_f = lr.predict(x_p)
df_result = pd.DataFrame({'y_f': y_f, 'y_p': y_p})

In [ ]: df_result

Out[103]: y_f y_p
881 B B
1752 B B
803 A A
964 A A
5 A A
1505 B B
1857 C C
1742 C B
2723 C C
554 B B
714 A A
930 B B
1859 C C
2080 B B
225 A A
2348 C C
2825 C C
705 A A
484 A A
2074 C C
379 A A
1884 C C
345 A A
1703 B B
811 A A
2090 B B
2045 B B
2823 C C
1531 B B
1174 B B
... ... ...
7 A A
2208 C C
1973 B B
1919 B B
537 A A
156 A A
1337 A A
2586 C C
540 A A
2501 C C
889 B B
437 A A
712 A A
1367 B B
1796 B B
2624 C C
326 A A
2735 C C
1879 B B
502 B B
589 A A
2124 C C
763 A A
2696 C C
74 B B
730 A A
1598 C B
1853 B B
1065 A A
1236 A A
576 rows × 2 columns

In [ ]: # Accuracy
total_reg = len(df_result)
bien_clas = sum(df_result["y_f"] == df_result["y_p"])
bien_clas/total_reg

Out[104]: 0.94618055555555558
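The same figure comes out of sklearn's accuracy_score; a minimal sketch:

In [ ]: # Accuracy via sklearn instead of the manual count
from sklearn.metrics import accuracy_score
accuracy_score(y_p, y_f)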

In [ ]: from sklearn.metrics import confusion_matrix

In [ ]: cf=confusion_matrix(y_p, y_f)


In [ ]: cf_df = pd.DataFrame(cf,
                     index=["A","B","C"],
                     columns=["A","B","C"])
plt.figure(figsize=(5,4))
sns.heatmap(cf_df, annot=True)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

In [ ]: # Precision = TP / (TP+FP)
from sklearn.metrics import precision_score
precision_score(y_p, y_f, average="macro")

Out[108]: 0.94544823795492838

In [ ]: # Recall = TP / (TP+FN)
from sklearn.metrics import recall_score
recall_score(y_p, y_f, average="macro")

Out[109]: 0.95128301102100232

In [ ]: # f1 = 2TP/(2TP+FP+FN)
from sklearn.metrics import f1_score
f1_score(y_p, y_f, average="macro")

Out[110]: 0.9471270383308692
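The per-class numbers behind these macro averages can be printed in one call; a minimal sketch using classification_report:

In [ ]: # Precision, recall and F1 per segment in a single report
from sklearn.metrics import classification_report
print(classification_report(y_p, y_f))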

MILESTONE N°03
1 Build a decision tree to determine whether a client belongs to a given category or not

2 Random forest for the numeric and the categorical targets

3 Feature relevance

Decision Tree - Categorical Target


In [ ]: # Load the DecisionTreeClassifier learner

regressor = DecisionTreeClassifier(random_state=8, max_depth=4, min_samples_leaf=6)
regressor.fit(x_t,y_t)

Out[111]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
          max_features=None, max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=6, min_samples_split=2,
          min_weight_fraction_leaf=0.0, presort=False, random_state=8,
          splitter='best')

In [ ]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 8 columns):
anio 2880 non-null int64
mes 2880 non-null int64
semana 2880 non-null int64
cliente 2880 non-null object
ticket 2880 non-null int64
categorias 2880 non-null int64
productos_unicos 2880 non-null int64
segmento 2880 non-null object
dtypes: int64(6), object(2)
memory usage: 180.1+ KB

In [ ]: y = df["segmento"]
x = df[["anio", "mes", "semana", "ticket", "categorias", "productos_unicos"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)

In [ ]: # Train the model
clf_model = DecisionTreeClassifier(criterion="gini", random_state=9, max_depth=3, min_samples_leaf=5)
clf_model.fit(x_t,y_t)

Out[114]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
          max_features=None, max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=5, min_samples_split=2,
          min_weight_fraction_leaf=0.0, presort=False, random_state=9,
          splitter='best')

In [ ]: # Predict on the test set
y_f =clf_model.predict(x_p)


In [ ]: df_result = pd.DataFrame({"y_f":y_f,
"y_p":y_p})
df_result

Out[116]: y_f y_p
881 B B
1752 B B
803 A A
964 A A
5 A A
1505 B B
1857 C C
1742 B B
2723 C C
554 B B
714 A A
930 B B
1859 C C
2080 B B
225 A A
2348 C C
2825 C C
705 A A
484 A A
2074 C C
379 A A
1884 C C
345 A A
1703 B B
811 A A
2090 B B
2045 B B
2823 C C
1531 B B
1174 B B
... ... ...
7 A A
2208 C C
1973 B B
1919 B B
537 A A
156 A A
1337 A A
2586 C C
540 A A
2501 C C
889 B B
437 A A
712 A A
1367 B B
1796 B B
2624 C C
326 A A
2735 C C
1879 B B
502 B B
589 A A
2124 C C
763 A A
2696 C C
74 B B
730 A A
1598 B B
1853 B B
1065 A A
1236 A A
576 rows × 2 columns

In [ ]: # Accuracy
total_reg = len(df_result)
bien_clas = sum(df_result["y_f"] == df_result["y_p"])
bien_clas/total_reg

Out[117]: 0.96006944444444442


In [ ]: from sklearn.metrics import confusion_matrix

cf = confusion_matrix(y_p, y_f)
cf_df = pd.DataFrame(cf,
                     index=["A","B","C"],
                     columns=["A","B","C"])
plt.figure(figsize=(5,4))
sns.heatmap(cf_df, annot=True)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

In [ ]: # Precision = TP / (TP+FP)
from sklearn.metrics import precision_score
precision_score(y_p, y_f, average="macro")

Out[119]: 0.9612794612794614

In [ ]: # Recall = TP / (TP+FN)
from sklearn.metrics import recall_score
recall_score(y_p, y_f, average="macro")

Out[120]: 0.9665211062590976

In [ ]: # f1 = 2TP/(2TP+FP+FN)
from sklearn.metrics import f1_score
f1_score(y_p, y_f, average="macro")

Out[121]: 0.96182141279672939

Random Forest - Numeric Target

In [ ]: # Split into train and test sets for the NUMERIC target "ticket"
y = df["ticket"]
x = df[["anio", "mes", "semana", "productos_unicos", "categorias"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)


In [ ]: # Load the RandomForestRegressor learner

from sklearn.ensemble import RandomForestRegressor
modelo = RandomForestRegressor(
    n_estimators = 15,
    criterion = "mse",
    max_depth = 3,
    max_features = 3,
    n_jobs = -1,
    random_state = 9)

modelo.fit(x_t,y_t)

y_f = modelo.predict(x_p)
df_result = pd.DataFrame({'y_f': y_f, 'y_p': y_p})

In [ ]: # Predict on x_p
y_f=modelo.predict(x_p)


In [ ]: df_result = pd.DataFrame({"y_f":y_f,
"y_p":y_p})
df_result

Out[125]: y_f y_p
881 1533.053945 1500
1752 1807.879300 1800
803 2583.647503 2700
964 2045.834568 2100
5 2045.834568 2100
1505 1918.202979 1890
1857 900.000000 900
1742 900.000000 900
2723 450.000000 450
554 1533.053945 1500
714 2540.062445 2400
930 1533.053945 1500
1859 900.000000 900
2080 1918.202979 1890
225 2542.094795 2700
2348 750.000000 750
2825 750.000000 750
705 2542.094795 2700
484 2045.834568 2100
2074 900.000000 900
379 2581.615152 2700
1884 900.000000 900
345 2045.834568 2100
1703 1918.202979 1890
811 2581.615152 2700
2090 1807.879300 1800
2045 1807.879300 1800
2823 450.000000 450
1531 1918.202979 1890
1174 1533.053945 1500
... ... ...
7 2045.834568 2100
2208 900.000000 900
1973 1807.879300 1800
1919 1918.202979 1890
537 2045.834568 2100
156 2045.834568 2100
1337 2540.062445 2700
2586 750.000000 750
540 2045.834568 2100
2501 900.000000 900
889 1533.053945 1500
437 2045.834568 2100
712 2513.460089 2400
1367 1533.053945 1500
1796 1918.202979 1890
2624 450.000000 450
326 2540.062445 2400
2735 750.000000 750
1879 1807.879300 1800
502 1533.053945 1500
589 2045.834568 2100
2124 900.000000 900
763 2581.615152 2700
2696 450.000000 450
74 1533.053945 1500
730 2045.834568 2100
1598 900.000000 900
1853 1807.879300 1800
1065 2045.834568 2100
1236 2513.460089 2400
576 rows × 2 columns

Performance metrics for the numeric target

In [ ]: # MAE: Mean Absolute Error
sum(abs(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[126]: 40.734898976039453

In [ ]: # MSE: Mean Squared Error
sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[127]: 3783.4599453716687

In [ ]: # RMSE: Root Mean Squared Error
np.sqrt(sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p']))

Out[128]: 61.509836167654264
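Milestone item 3 asks for feature relevance; the fitted regressor already exposes it, so here is a minimal sketch before moving on to the classifier (at this point modelo is still the RandomForestRegressor):

In [ ]: # Rank the predictors of "ticket" by impurity-based importance
pd.Series(modelo.feature_importances_, index=x_t.columns).sort_values(ascending=False)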

Random Forest - Categorical Target

In [ ]: # Split into train and test sets for the CATEGORICAL target "segmento"
y = df["segmento"]
x = df[["anio", "mes", "semana", "ticket", "categorias", "productos_unicos"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)


In [ ]: # Load the RandomForestClassifier learner

from sklearn.ensemble import RandomForestClassifier
modelo = RandomForestClassifier(
    n_estimators = 20,
    max_depth = 6,
    max_features = 3,
    n_jobs = -1,
    random_state = 9)

modelo.fit(x_t,y_t)

Out[130]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
          max_depth=6, max_features=3, max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=9, verbose=0, warm_start=False)

In [ ]: # Predict on x_p
y_f=modelo.predict(x_p)


In [ ]: df_result = pd.DataFrame({"y_f":y_f,
"y_p":y_p})
df_result

Out[132]: y_f y_p
881 B B
1752 B B
803 A A
964 A A
5 A A
1505 B B
1857 C C
1742 B B
2723 C C
554 B B
714 A A
930 B B
1859 C C
2080 B B
225 A A
2348 C C
2825 C C
705 A A
484 A A
2074 C C
379 A A
1884 C C
345 A A
1703 B B
811 A A
2090 B B
2045 B B
2823 C C
1531 B B
1174 B B
... ... ...
7 A A
2208 C C
1973 B B
1919 B B
537 A A
156 A A
1337 A A
2586 C C
540 A A
2501 C C
889 B B
437 A A
712 A A
1367 B B
1796 B B
2624 C C
326 A A
2735 C C
1879 B B
502 B B
589 A A
2124 C C
763 A A
2696 C C
74 B B
730 A A
1598 B B
1853 B B
1065 A A
1236 A A
576 rows × 2 columns

In [ ]: # Accuracy
total_reg = len(df_result)
bien_clas = sum(df_result["y_f"] == df_result["y_p"])
bien_clas/total_reg

Out[133]: 1.0
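A perfect score on a single split deserves a second look, and a cross-validated accuracy is a cheap sanity check; a minimal sketch (the 1.0 is plausible here, since ticket and categorias map almost deterministically to the segment):

In [ ]: # 5-fold cross-validated accuracy as a sanity check on the perfect test score
from sklearn.model_selection import cross_val_score
cross_val_score(modelo, x, y, cv=5).mean()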


In [ ]: from sklearn.metrics import confusion_matrix

cf = confusion_matrix(y_p, y_f)
cf_df = pd.DataFrame(cf,
                     index=["A","B","C"],
                     columns=["A","B","C"])
plt.figure(figsize=(5,4))
sns.heatmap(cf_df, annot=True)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

In [ ]: # Precision = TP / (TP+FP)
from sklearn.metrics import precision_score
precision_score(y_p, y_f, average="macro")

Out[135]: 1.0

In [ ]: # Recall = TP / (TP+FN)
from sklearn.metrics import recall_score
recall_score(y_p, y_f, average="macro")

Out[136]: 1.0

In [ ]: # f1 = 2TP/(2TP+FP+FN)
from sklearn.metrics import f1_score
f1_score(y_p, y_f, average="macro")

Out[137]: 1.0

Feature relevance in the Random Forest

In [ ]: modelo.feature_importances_

Out[138]: array([0.        , 0.1287429 , 0.00330678, 0.33827269, 0.21664793, 0.3130297 ])


In [ ]: plt.barh(x_t.columns,modelo.feature_importances_)
plt.show()
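Impurity-based importances can overstate numeric features with many distinct values; newer scikit-learn releases also offer permutation importance as a cross-check. A minimal sketch, assuming scikit-learn >= 0.22:

In [ ]: # Permutation importance on the held-out split (requires scikit-learn >= 0.22)
from sklearn.inspection import permutation_importance
pi = permutation_importance(modelo, x_p, y_p, n_repeats=10, random_state=9)
pd.Series(pi.importances_mean, index=x_p.columns).sort_values(ascending=False)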

MILESTONE N°04: Neural Network


In [2]: import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [4]: df = pd.read_csv('bd_clientes_2020.csv')  # reload the dataset for this session


In [5]: df.shape

Out[5]: (2880, 8)

In [6]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 anio 2880 non-null int64
1 mes 2880 non-null int64
2 semana 2880 non-null int64
3 cliente 2880 non-null object
4 ticket 2880 non-null int64
5 categorias 2880 non-null int64
6 productos_unicos 2880 non-null int64
7 segmento 2880 non-null object
dtypes: int64(6), object(2)
memory usage: 180.1+ KB

Neural Network with a Numeric Target

In [18]: # Split the data into train and test sets

y = df["ticket"]
x = df[["anio", "mes", "semana", "productos_unicos", "categorias"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)


In [10]: opt = keras.optimizers.Adam(learning_rate = 0.12)

model = keras.models.Sequential()
model.add(keras.layers.Dense(6, input_dim=5, kernel_initializer="normal", activation="relu"))
model.add(keras.layers.Dense(1, kernel_initializer="normal"))
model.compile(loss="mean_squared_error", optimizer=opt)
model.fit(x_t, y_t, batch_size=20, epochs=50, verbose=1)

Epoch 1/50
116/116 [==============================] - 1s 2ms/step - loss: 2916612.5000
Epoch 2/50
116/116 [==============================] - 0s 2ms/step - loss: 2873042.7500
Epoch 3/50
116/116 [==============================] - 0s 2ms/step - loss: 2830540.7500
Epoch 4/50
116/116 [==============================] - 0s 2ms/step - loss: 2788632.7500
Epoch 5/50
116/116 [==============================] - 0s 2ms/step - loss: 2747269.0000
Epoch 6/50
116/116 [==============================] - 0s 2ms/step - loss: 2706352.2500
Epoch 7/50
116/116 [==============================] - 0s 2ms/step - loss: 2666070.7500
Epoch 8/50
116/116 [==============================] - 0s 2ms/step - loss: 2626314.5000
Epoch 9/50
116/116 [==============================] - 0s 2ms/step - loss: 2586923.5000
Epoch 10/50
116/116 [==============================] - 0s 2ms/step - loss: 2548163.0000
Epoch 11/50
116/116 [==============================] - 0s 2ms/step - loss: 2509822.0000
Epoch 12/50
116/116 [==============================] - 1s 4ms/step - loss: 2471903.0000
Epoch 13/50
116/116 [==============================] - 0s 4ms/step - loss: 2434570.5000
Epoch 14/50
116/116 [==============================] - 0s 2ms/step - loss: 2397754.7500
Epoch 15/50
116/116 [==============================] - 0s 1ms/step - loss: 2361337.0000
Epoch 16/50
116/116 [==============================] - 0s 1ms/step - loss: 2325228.2500
Epoch 17/50
116/116 [==============================] - 0s 1ms/step - loss: 2289570.2500
Epoch 18/50
116/116 [==============================] - 0s 1ms/step - loss: 2254541.0000
Epoch 19/50
116/116 [==============================] - 0s 1ms/step - loss: 2219812.5000
Epoch 20/50
116/116 [==============================] - 0s 1ms/step - loss: 2185505.0000
Epoch 21/50
116/116 [==============================] - 0s 1ms/step - loss: 2151740.2500
Epoch 22/50
116/116 [==============================] - 0s 1ms/step - loss: 2118250.0000
Epoch 23/50
116/116 [==============================] - 0s 1ms/step - loss: 2085126.5000
Epoch 24/50
116/116 [==============================] - 0s 1ms/step - loss: 2052417.3750
Epoch 25/50
116/116 [==============================] - 0s 1ms/step - loss: 2020124.5000
Epoch 26/50
116/116 [==============================] - 0s 1ms/step - loss: 1988186.6250
Epoch 27/50
116/116 [==============================] - 0s 1ms/step - loss: 1956697.5000
Epoch 28/50
116/116 [==============================] - 0s 1ms/step - loss: 1925606.5000
Epoch 29/50
116/116 [==============================] - 0s 1ms/step - loss: 1894932.6250
Epoch 30/50

116/116 [==============================] - 0s 1ms/step - loss: 1864675.5000
Epoch 31/50
116/116 [==============================] - 0s 1ms/step - loss: 1834724.8750
Epoch 32/50
116/116 [==============================] - 0s 1ms/step - loss: 1805070.2500
Epoch 33/50
116/116 [==============================] - 0s 1ms/step - loss: 1775889.7500
Epoch 34/50
116/116 [==============================] - 0s 1ms/step - loss: 1747008.7500
Epoch 35/50
116/116 [==============================] - 0s 1ms/step - loss: 1718562.0000
Epoch 36/50
116/116 [==============================] - 0s 1ms/step - loss: 1690402.8750
Epoch 37/50
116/116 [==============================] - 0s 1ms/step - loss: 1662699.8750
Epoch 38/50
116/116 [==============================] - 0s 1ms/step - loss: 1635375.5000
Epoch 39/50
116/116 [==============================] - 0s 2ms/step - loss: 1608355.5000
Epoch 40/50
116/116 [==============================] - 0s 1ms/step - loss: 1581654.0000
Epoch 41/50
116/116 [==============================] - 0s 1ms/step - loss: 1555291.5000
Epoch 42/50
116/116 [==============================] - 0s 1ms/step - loss: 1529297.0000
Epoch 43/50
116/116 [==============================] - 0s 1ms/step - loss: 1503746.1250
Epoch 44/50
116/116 [==============================] - 0s 1ms/step - loss: 1478571.5000
Epoch 45/50
116/116 [==============================] - 0s 1ms/step - loss: 1453597.5000
Epoch 46/50
116/116 [==============================] - 0s 1ms/step - loss: 1428998.3750
Epoch 47/50
116/116 [==============================] - 0s 1ms/step - loss: 1404805.5000
Epoch 48/50
116/116 [==============================] - 0s 1ms/step - loss: 1380977.3750
Epoch 49/50
116/116 [==============================] - 0s 1ms/step - loss: 1357417.5000
Epoch 50/50
116/116 [==============================] - 0s 1ms/step - loss: 1334234.2500

Out[10]: <keras.callbacks.History at 0x7f5ac8344f10>


In [13]: # Predict on x_p

y_f0 = model.predict(x_p)
y_f = []
for i in range(len(y_f0)):
    y_f.append(y_f0[i][0])

y_f
Out[13]: [646.3365, 646.3365, 646.3365, ...]  (list truncated in the export; every displayed prediction is 646.3365)
In [14]: df_result = pd.DataFrame({'y_f': y_f, 'y_p': y_p})

Metrics

In [15]: # MAE: Mean Absolute Error
sum(abs(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[15]: 999.6164646148682

In [16]: # MSE: Mean Squared Error
sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p'])

Out[16]: 1389653.0628617601

In [17]: # RMSE: Root Mean Squared Error
np.sqrt(sum(np.square(df_result['y_p']-df_result['y_f']))/len(df_result['y_p']))

Out[17]: 1178.8354689530513
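The network's metrics are far worse than the tree models and its predictions collapse to a single value; a common cause is unscaled inputs (the anio column sits near 2020 and dominates the gradients). A minimal sketch of the same architecture on standardized features, with StandardScaler as an assumed preprocessing choice:

In [ ]: # Standardize the inputs before training; often enough for the net to start learning
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_t_s = scaler.fit_transform(x_t)
x_p_s = scaler.transform(x_p)

model_s = keras.models.Sequential()
model_s.add(keras.layers.Dense(6, input_dim=5, activation="relu"))
model_s.add(keras.layers.Dense(1))
model_s.compile(loss="mean_squared_error", optimizer=keras.optimizers.Adam(learning_rate=0.01))
model_s.fit(x_t_s, y_t, batch_size=20, epochs=50, verbose=0)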

Neural Network with a Categorical Target

In [32]: # Split the data into train and test sets

y = df["segmento"]
x = df[["anio", "mes", "semana", "ticket", "categorias", "productos_unicos"]]
x_t, x_p, y_t, y_p = train_test_split(x, y, test_size = 0.20, random_state = 9)

In [ ]: y_td = keras.utils.to_categorical(y_t.map({"A": 0, "B": 1, "C": 2}))  # to_categorical needs integer codes, not letters


In [ ]: # Build the network and train

model = keras.models.Sequential()
model.add(keras.layers.Dense(6, input_dim=6, activation="relu", kernel_initializer="he_uniform"))
model.add(keras.layers.Dense(3, activation="softmax"))
opt = keras.optimizers.SGD(learning_rate=0.18)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
model.fit(x_t, y_td, epochs=200, verbose=1)

In [ ]: y_f0 = model.predict(x_p)
y_f0

In [ ]: y_f0 = model.predict(x_p)
y_f = []
for i in range(len(y_f0)):
    y_f.append(np.argmax(y_f0[i]))
y_f
df_result = pd.DataFrame({'y_f': y_f,
'y_p': y_p})
df_result
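Note that y_f holds integer class indices while y_p holds the original letters, so the two columns cannot be compared directly; a minimal sketch that maps the indices back, assuming the same A/B/C to 0/1/2 coding used for to_categorical above:

In [ ]: # Map predicted class indices back to segment letters and score the network
labels = {0: "A", 1: "B", 2: "C"}
y_f_letters = [labels[i] for i in y_f]
sum(np.array(y_f_letters) == y_p.values) / len(y_p)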

