
In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math
import datetime
sns.set()
sns.set_style('whitegrid')
# plt.style.use("dark_background")
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
from xgboost import XGBRegressor
from xgboost import plot_importance
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from scipy.signal import periodogram


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time
from itertools import product
%matplotlib inline
import os
import matplotlib.style as style
import matplotlib.gridspec as gridspec
import scipy.stats as stats
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-7807e2964a22> in <module>()
     16 from xgboost import XGBRegressor
     17 from xgboost import plot_importance
---> 18 from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
     19 from scipy.signal import periodogram
     20

ModuleNotFoundError: No module named 'statsmodels.tsa.deterministic'
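The missing module points at an outdated statsmodels: tsa.deterministic (home of CalendarFourier and DeterministicProcess) was introduced in statsmodels 0.12, so upgrading the package in the notebook environment should resolve it:

%pip install -U statsmodels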

In [2]:

d1 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180…
l1 = d1.keys()
len(l1)

Out[2]:

40

In [3]:

d2 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180…
l2 = d2.keys()
len(l2)

Out[3]:

50

In [6]:

a, b, c = 2, 4, 8
print(b/a + c)
print(b/(a + c))

10.0

0.4
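Both results follow from operator precedence: division binds tighter than addition, so b/a+c evaluates as (4/2) + 8 = 10.0, while the explicit parentheses force 4/(2 + 8) = 0.4.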


In [4]:

set(l2) - set(l1)

Out[4]:

{'accurate',
 'actions',
 'add',
 'aged',
 'ages',
 'agree',
 'akin',
 'allow',
 'amaze',
 'angel'}
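The set difference holds the 10 keys present in d2 but missing from d1, consistent with the lengths above (50 - 40 = 10), since d1's keys appear to be a subset of d2's.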
In [2]:

for dirname, _, filenames in os.walk(r'D:\AppliedAI\Python_Scripts\Kaggle_Data'):
    for filename in filenames:
        # print(os.path.join(filename))
        if filename.find('train') != -1:
            train = pd.read_csv(os.path.join(dirname, filename))
        elif filename.find('test') != -1:
            test = pd.read_csv(os.path.join(dirname, filename))
        else:
            submission = pd.read_csv(os.path.join(dirname, filename))

In [12]:

test['num_sold'] = 0
test.head()

Out[12]:

row_id date country store product num_sold

0 26298 2019-01-01 Finland KaggleMart Kaggle Mug 0

1 26299 2019-01-01 Finland KaggleMart Kaggle Hat 0

2 26300 2019-01-01 Finland KaggleMart Kaggle Sticker 0

3 26301 2019-01-01 Finland KaggleRama Kaggle Mug 0

4 26302 2019-01-01 Finland KaggleRama Kaggle Hat 0

In [17]:

matrix = pd.concat([train[['date','num_sold']], test[['date','num_sold']]], ignore_index=True)

Lag Features
In [19]:

df = matrix.reset_index()
df = df.set_index(['date'])
entries_perday = len(df.loc['2015-01-08'])
df = df.reset_index().set_index('date')
df.drop('index', axis=1, inplace=True)
print('Number of entries in a day across all stores : {}'.format(entries_perday))
df.head()

Number of entries in a day across all stores : 18

Out[19]:

            num_sold
date
2015-01-01       329
2015-01-01       520
2015-01-01       146
2015-01-01       572
2015-01-01       911

In [21]:

for lag in range(1, 11):
    df['num_sold_lag' + str(lag)] = df['num_sold'].shift(18 * lag)
df.head(5)

Out[21]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag3  num_sold_lag4  ...
date
2015-01-01       329            NaN            NaN            NaN            NaN  ...
2015-01-01       520            NaN            NaN            NaN            NaN  ...
2015-01-01       146            NaN            NaN            NaN            NaN  ...
2015-01-01       572            NaN            NaN            NaN            NaN  ...
2015-01-01       911            NaN            NaN            NaN            NaN  ...
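Shifting by 18 * lag works because every date carries exactly 18 rows (apparently 3 countries x 2 stores x 3 products), so a shift of 18 steps the series back one full day. This quietly assumes the 18 rows are ordered identically on every date; a more explicit alternative (a sketch, assuming the original train frame with its country, store, and product columns) groups before shifting:

train = train.sort_values(['country', 'store', 'product', 'date'])
for lag in range(1, 11):
    # shift within each series, so lags never cross series boundaries
    train[f'num_sold_lag{lag}'] = (
        train.groupby(['country', 'store', 'product'])['num_sold'].shift(lag)
    )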
In [22]:

plt.figure(figsize=(16, 8))
columns = [col for col in df.columns if 'num_sold' in col]
sns.heatmap(data=df[df.index < '2019-01-01'][columns].corr(), square=True,
            annot=True, cmap="Reds", vmax=1, vmin=.83, fmt=".3f")
plt.xticks(rotation=40)
plt.title('Correlation - sales vs lagged sales', fontsize=18);

Scatter Plot
In [24]:

fig, axs = plt.subplots(2, 5, figsize=(15, 6))
axs = axs.flatten()
for i in range(1, 11):
    feature = 'num_sold_lag' + str(i)
    sns.scatterplot(x=df[feature], y=df['num_sold'], ax=axs[i - 1], s=5)
    axs[i - 1].set_title(feature, fontsize=14)

plt.suptitle('Scatterplot - Lags vs Sales', fontsize=18)
plt.tight_layout();

In [25]:

df.drop(columns=['num_sold_lag3', 'num_sold_lag4', 'num_sold_lag5', 'num_sold_lag6',
                 'num_sold_lag8', 'num_sold_lag9', 'num_sold_lag10'], inplace=True)

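Judging by the head() output further down, only num_sold_lag1, num_sold_lag2, and num_sold_lag7 survive the drop: yesterday, two days ago, and the same weekday one week back.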

In [28]:

# Creating a periodogram.

fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
frequencies, spectrum = periodogram(
    train['num_sold'],
    fs=fs,
    detrend='linear',
    window="boxcar",
    scaling='spectrum',
)
fig, ax = plt.subplots(figsize=(16, 5))
ax.step(frequencies, spectrum, color="purple")
ax.set_xscale("log")
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
    [
        "Annual (1)",
        "Semiannual (2)",
        "Quarterly (4)",
        "Bimonthly (6)",
        "Monthly (12)",
        "Biweekly (26)",
        "Weekly (52)",
        "Semiweekly (104)",
    ],
    rotation=90,
)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
ax.set_ylabel("Variance")
ax.set_title("Periodogram", fontsize=18);
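Here fs is the sampling rate in samples per year (pd.Timedelta("1Y") / pd.Timedelta("1D") ≈ 365), so the frequency axis reads in cycles per year: a peak at 52 means weekly structure, at 104 half-weekly, and so on, which is what motivates the weekly Fourier terms in the next cell.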
In [29]:

# choosing order = 4 because semiweekly, biweekly, and weekly periodicity was observed
fourier = CalendarFourier(freq="W", order=4)
data = df.reset_index().set_index([pd.DatetimeIndex(df.reset_index()['date'])])
y = data['num_sold']
dp = DeterministicProcess(
    index=y.index,
    order=1,
    seasonal=False,
    constant=False,
    additional_terms=[fourier],
    drop=True
)

X = dp.in_sample()
X.shape

Out[29]:

(32868, 7)
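The 7 columns are one linear trend (from order=1) plus Fourier terms: order=4 nominally produces four sin/cos pairs (8 columns), and drop=True removes terms found collinear on this index, which matches the six surviving Fourier columns (sin/cos 1 and 2, cos 3, sin 4) selected later in cell [45].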

In [31]:

df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index().set_index('date')
df.head()

Out[31]:

            index  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7
date
2015-01-01      0       329            NaN            NaN            NaN
2015-01-01      1       520            NaN            NaN            NaN
2015-01-01      2       146            NaN            NaN            NaN
2015-01-01      3       572            NaN            NaN            NaN
2015-01-01      4       911            NaN            NaN            NaN


In [32]:

df = df.drop('index', axis=1).join(X.drop('trend', axis=1), how='inner')
df.head()

Out[32]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)
date
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
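Notice that all five rows now show num_sold = 329: joining on a non-unique date index matches each of a date's 18 rows in df with all 18 rows of X for that date, multiplying rows. If a positional one-to-one alignment was the intent, a sketch like this (assuming both frames share the same row order) avoids the blow-up:

df = pd.concat([df.reset_index(), X.reset_index(drop=True).drop('trend', axis=1)], axis=1)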
In [35]:

df = df.reset_index()
X = df[['date', 'num_sold']]
X['date'] = X['date'].apply(lambda x: x.toordinal())
# https://www.analyticsvidhya.com/blog/2021/05/k-means-clustering-with-mall-customer-seg
# Importing KMeans from sklearn
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)
# The elbow curve
plt.figure(figsize=(12, 6))
plt.plot(range(1, 11), wcss)
plt.plot(range(1, 11), wcss, linewidth=2, color="red", marker="8")
plt.xlabel("K Value")
plt.xticks(np.arange(1, 11, 1))
plt.ylabel("WCSS")
plt.show()
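WCSS is KMeans' inertia_, the within-cluster sum of squared distances; the elbow marks the K beyond which extra clusters stop paying for themselves. One caveat: date ordinals here are around 735,000 while num_sold is in the hundreds, so unscaled Euclidean distances are dominated almost entirely by the date. A sketch of clustering on scaled features instead (MinMaxScaler is one reasonable choice, not what this notebook does):

from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)  # both columns now in [0, 1]
km = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X_scaled)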
In [36]:

# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fitting the input data
km1.fit(X)
# Predicting the labels of the input data
y = km1.predict(X)
# Adding the labels to a column named label
df["label"] = y
# The new dataframe with the clustering done
df.head()

Out[36]:

         date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
In [38]:

# extract date features

df['year'] = pd.to_datetime(df['date']).dt.year
df['month_name'] = pd.to_datetime(df['date']).dt.month_name()
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df['day_of_week_name'] = pd.to_datetime(df['date']).dt.day_name()
df['day_of_week'] = pd.to_datetime(df['date']).dt.day_of_week
df.head()

Out[38]:

         date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...

Standardization

In [41]:

df.fillna(0, inplace=True)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']])
scaled = pd.DataFrame(scaler.transform(df[['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']]))
scaled.columns = ['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']
scaled.head()

Out[41]:

num_sold num_sold_lag1 num_sold_lag2 num_sold_lag7

0 0.066655 -1.091752 -1.091752 -1.091752

1 0.066655 -1.091752 -1.091752 -1.091752

2 0.066655 -1.091752 -1.091752 -1.091752

3 0.066655 -1.091752 -1.091752 -1.091752

4 0.066655 -1.091752 -1.091752 -1.091752
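StandardScaler z-scores each column as (x - mean) / std using statistics learned in fit. Since it was fitted on the full frame, including the test-period rows whose num_sold was filled with 0, test data leaks into the scaling statistics; the usual practice is to fit on the training period only, e.g. scaler.fit(df.loc[df['date'] < '2019-01-01', cols]) (a sketch, with cols standing for the four num_sold columns).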


In [45]:

df = df[['date', 'sin(1,freq=W-SUN)', 'cos(1,freq=W-SUN)', 'sin(2,freq=W-SUN)',
         'cos(2,freq=W-SUN)', 'cos(3,freq=W-SUN)', 'sin(4,freq=W-SUN)', 'label',
         'year', 'month_name', 'month', 'day', 'day_of_week',
         'day_of_week_name']].join(scaled, how='inner')

In [46]:

df = pd.get_dummies(df)
df.head()

Out[46]:

         date  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  sin(2,freq=W-SUN)  cos(2,freq=W-SUN)  cos(3,freq=W-SUN)  sin(4,freq=W-SUN)  ...
0  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
1  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
2  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
3  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
4  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
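pd.get_dummies one-hot encodes the object-typed columns (month_name and day_of_week_name here) into 0/1 indicator columns and passes the numeric columns through unchanged.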

In [50]:

X_train = df[df.date < '2018-06-30'].drop(['num_sold', 'date'], axis=1)
Y_train = df[df.date < '2018-06-30']['num_sold']
X_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')].drop(['num_sold', 'date'], axis=1)
Y_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')]['num_sold']
X_test = df[df.date > '2018-12-31'].drop(['num_sold', 'date'], axis=1)
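Note the strict inequalities: rows dated exactly 2018-06-30 or 2018-12-31 land in none of the three sets. Making one side of each boundary <= would keep every row.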
In [51]:

ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

model.fit(
    X_train,
    Y_train,
    eval_metric="mape",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10)

time.time() - ts

[0] validation_0-mape:4.85086 validation_1-mape:8.26721

[1] validation_0-mape:3.42136 validation_1-mape:5.78417

[2] validation_0-mape:2.44605 validation_1-mape:4.01639

[3] validation_0-mape:1.77397 validation_1-mape:2.79364

[4] validation_0-mape:1.34831 validation_1-mape:2.01040

[5] validation_0-mape:1.10281 validation_1-mape:1.44146

[6] validation_0-mape:0.95003 validation_1-mape:1.15750

[7] validation_0-mape:0.86617 validation_1-mape:1.00791

[8] validation_0-mape:0.81581 validation_1-mape:0.91798

[9] validation_0-mape:0.79867 validation_1-mape:0.86188

[10] validation_0-mape:0.78690 validation_1-mape:0.80864

[11] validation_0-mape:0.77299 validation_1-mape:0.80242

[12] validation_0-mape:0.75649 validation_1-mape:0.80535

[13] validation_0-mape:0.75429 validation_1-mape:0.81420

[14] validation_0-mape:0.75406 validation_1-mape:0.81496

[15] validation_0-mape:0.75047 validation_1-mape:0.81933

[16] validation_0-mape:0.75265 validation_1-mape:0.82055

[17] validation_0-mape:0.75017 validation_1-mape:0.82509

[18] validation_0-mape:0.75107 validation_1-mape:0.82793

[19] validation_0-mape:0.75062 validation_1-mape:0.82923

[20] validation_0-mape:0.75051 validation_1-mape:0.82955

[21] validation_0-mape:0.75093 validation_1-mape:0.82811

Out[51]:

15.90307068824768
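Validation MAPE bottoms out at round 11 (0.80242), and with early_stopping_rounds=10 training halts at round 21, ten rounds past the best score, after roughly 16 seconds. The fitted model records the winning round, e.g.:

print(model.best_iteration, model.best_score)

and recent xgboost releases use that best iteration for predict by default when early stopping fired.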

Predictions
In [52]:

model.predict(X_test)

Out[52]:

array([0.3278924 , 0.3278924 , 0.3278924 , ..., 0.86840504, 0.86840504,

0.86840504], dtype=float32)
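Because num_sold was standardized earlier, these predictions are z-scores rather than unit sales. A minimal sketch of mapping them back, assuming scaler is the StandardScaler fitted above and num_sold was its first column:

preds = model.predict(X_test)
preds_unscaled = preds * scaler.scale_[0] + scaler.mean_[0]  # invert the z-scoring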

In [11]:

df = pd.read_excel("saaki test.xlsx")
df

Out[11]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

... ... ...

32666 109.99, Sale74.95 Kamik Gamma2 Snow Boots (For Women)

32667 $175.00 Hunter Rain Boots - Tour Neoprene

32668 175.00, Sale119.95 Blondo Steffy Snow Boots - Waterproof (For Women)

32669 $169.00 Maine Pac Boots

32670 59.99, Sale39.95 Itasca Icebreaker Snow Boots - Waterproof, Ins...

32671 rows × 2 columns

In [12]:

df.head()

Out[12]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

In [13]:

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
In [20]:

lst = df['product_title'].tolist()
# k = process.extract("Anti-Dark", lst)

In [23]:

k = process.extract("Anti-Dark", lst)

In [24]:

len(k)

Out[24]:

In [19]:

type(k)

Out[19]:

list

In [22]:

for i in k:
    print(fuzz.ratio('Anti-Matter', i))

24

17

15

18

20

10

24

14
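process.extract returns a list of (match, score) tuples, so if k still holds those tuples (the out-of-order cell numbers suggest k may have been rebuilt between runs), the loop is scoring 'Anti-Matter' against the tuples rather than the matched titles, which would explain the low ratios. Scoring against the strings themselves would look like:

for match, score in k:
    print(fuzz.ratio('Anti-Matter', match))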

