
In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math
import datetime
sns.set()
sns.set_style('whitegrid')
# plt.style.use("dark_background")
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
from xgboost import XGBRegressor
from xgboost import plot_importance
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from scipy.signal import periodogram


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time
from itertools import product
%matplotlib inline
import os
import matplotlib.style as style
import matplotlib.gridspec as gridspec
import scipy.stats as stats
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-7807e2964a22> in <module>()
     16 from xgboost import XGBRegressor
     17 from xgboost import plot_importance
---> 18 from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
     19 from scipy.signal import periodogram
     20

ModuleNotFoundError: No module named 'statsmodels.tsa.deterministic'
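The missing module points at an outdated statsmodels: tsa.deterministic (home of CalendarFourier and DeterministicProcess) was introduced in statsmodels 0.12, so upgrading the package in the notebook environment should resolve it:

%pip install -U statsmodels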

In [2]:

d1 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180…
l1 = d1.keys()
len(l1)

Out[2]:

40

In [3]:

d2 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180…
l2 = d2.keys()
len(l2)

Out[3]:

50

In [6]:

a, b, c = 2, 4, 8
print(b/a + c)
print(b/(a + c))

10.0

0.4
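Both results follow from operator precedence: division binds tighter than addition, so b/a+c evaluates as (4/2) + 8 = 10.0, while the explicit parentheses force 4/(2 + 8) = 0.4.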


In [4]:

set(l2) - set(l1)

Out[4]:

{'accurate',
 'actions',
 'add',
 'aged',
 'ages',
 'agree',
 'akin',
 'allow',
 'amaze',
 'angel'}
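The set difference holds the 10 keys present in d2 but missing from d1, consistent with the lengths above (50 - 40 = 10), since d1's keys appear to be a subset of d2's.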
In [2]:

for dirname, _, filenames in os.walk(r'D:\AppliedAI\Python_Scripts\Kaggle_Data'):
    for filename in filenames:
        # print(os.path.join(filename))
        if filename.find('train') != -1:
            train = pd.read_csv(os.path.join(dirname, filename))
        elif filename.find('test') != -1:
            test = pd.read_csv(os.path.join(dirname, filename))
        else:
            submission = pd.read_csv(os.path.join(dirname, filename))

In [12]:

test['num_sold'] = 0
test.head()

Out[12]:

row_id date country store product num_sold

0 26298 2019-01-01 Finland KaggleMart Kaggle Mug 0

1 26299 2019-01-01 Finland KaggleMart Kaggle Hat 0

2 26300 2019-01-01 Finland KaggleMart Kaggle Sticker 0

3 26301 2019-01-01 Finland KaggleRama Kaggle Mug 0

4 26302 2019-01-01 Finland KaggleRama Kaggle Hat 0

In [17]:

matrix = pd.concat([train[['date','num_sold']], test[['date','num_sold']]], ignore_index=True)

Lag Features
In [19]:

df = matrix.reset_index()
df = df.set_index(['date'])
entries_perday = len(df.loc['2015-01-08'])
df = df.reset_index().set_index('date')
df.drop('index', axis=1, inplace=True)
print('Number of entries in a day across all stores : {}'.format(entries_perday))
df.head()

Number of entries in a day across all stores : 18

Out[19]:

            num_sold
date
2015-01-01       329
2015-01-01       520
2015-01-01       146
2015-01-01       572
2015-01-01       911

In [21]:

for lag in range(1, 11):
    df['num_sold_lag' + str(lag)] = df['num_sold'].shift(18 * lag)
df.head(5)

Out[21]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag3  num_sold_lag4  ...
date
2015-01-01       329            NaN            NaN            NaN            NaN  ...
2015-01-01       520            NaN            NaN            NaN            NaN  ...
2015-01-01       146            NaN            NaN            NaN            NaN  ...
2015-01-01       572            NaN            NaN            NaN            NaN  ...
2015-01-01       911            NaN            NaN            NaN            NaN  ...
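Shifting by 18 * lag works because every date carries exactly 18 rows (apparently 3 countries x 2 stores x 3 products), so a shift of 18 steps the series back one full day. This quietly assumes the 18 rows are ordered identically on every date; a more explicit alternative (a sketch, assuming the original train frame with its country, store, and product columns) groups before shifting:

train = train.sort_values(['country', 'store', 'product', 'date'])
for lag in range(1, 11):
    # shift within each series, so lags never cross series boundaries
    train[f'num_sold_lag{lag}'] = (
        train.groupby(['country', 'store', 'product'])['num_sold'].shift(lag)
    )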
In [22]:

plt.figure(figsize=(16, 8))
columns = [col for col in df.columns if 'num_sold' in col]
sns.heatmap(data=df[df.index < '2019-01-01'][columns].corr(), square=True,
            annot=True, cmap="Reds", vmax=1, vmin=.83, fmt=".3f")
plt.xticks(rotation=40)
plt.title('Correlation - sales vs lagged sales', fontsize=18);

Scatter Plot
In [24]:

fig, axs = plt.subplots(2, 5, figsize=(15, 6))
axs = axs.flatten()
for i in range(1, 11):
    feature = 'num_sold_lag' + str(i)
    sns.scatterplot(x=df[feature], y=df['num_sold'], ax=axs[i - 1], s=5)
    axs[i - 1].set_title(feature, fontsize=14)

plt.suptitle('Scatterplot - Lags vs Sales', fontsize=18)
plt.tight_layout();

In [25]:

df.drop(columns=['num_sold_lag3', 'num_sold_lag4', 'num_sold_lag5', 'num_sold_lag6',
                 'num_sold_lag8', 'num_sold_lag9', 'num_sold_lag10'], inplace=True)

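Judging by the head() output further down, only num_sold_lag1, num_sold_lag2, and num_sold_lag7 survive the drop: yesterday, two days ago, and the same weekday one week back.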

In [28]:

# Creating a periodogram.

fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
frequencies, spectrum = periodogram(
    train['num_sold'],
    fs=fs,
    detrend='linear',
    window="boxcar",
    scaling='spectrum',
)
fig, ax = plt.subplots(figsize=(16, 5))
ax.step(frequencies, spectrum, color="purple")
ax.set_xscale("log")
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
    [
        "Annual (1)",
        "Semiannual (2)",
        "Quarterly (4)",
        "Bimonthly (6)",
        "Monthly (12)",
        "Biweekly (26)",
        "Weekly (52)",
        "Semiweekly (104)",
    ],
    rotation=90,
)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
ax.set_ylabel("Variance")
ax.set_title("Periodogram", fontsize=18);
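Here fs is the sampling rate in samples per year (pd.Timedelta("1Y") / pd.Timedelta("1D") ≈ 365), so the frequency axis reads in cycles per year: a peak at 52 means weekly structure, at 104 half-weekly, and so on, which is what motivates the weekly Fourier terms in the next cell.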
In [29]:

# choosing order = 4 because semiweekly, biweekly, and weekly periodicity was observed
fourier = CalendarFourier(freq="W", order=4)
data = df.reset_index().set_index([pd.DatetimeIndex(df.reset_index()['date'])])
y = data['num_sold']
dp = DeterministicProcess(
    index=y.index,
    order=1,
    seasonal=False,
    constant=False,
    additional_terms=[fourier],
    drop=True
)

X = dp.in_sample()
X.shape

Out[29]:

(32868, 7)
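The 7 columns are one linear trend (from order=1) plus Fourier terms: order=4 nominally produces four sin/cos pairs (8 columns), and drop=True removes terms found collinear on this index, which matches the six surviving Fourier columns (sin/cos 1 and 2, cos 3, sin 4) selected later in cell [45].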

In [31]:

df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index().set_index('date')
df.head()

Out[31]:

            index  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7
date
2015-01-01      0       329            NaN            NaN            NaN
2015-01-01      1       520            NaN            NaN            NaN
2015-01-01      2       146            NaN            NaN            NaN
2015-01-01      3       572            NaN            NaN            NaN
2015-01-01      4       911            NaN            NaN            NaN


In [32]:

df = df.drop('index', axis=1).join(X.drop('trend', axis=1), how='inner')
df.head()

Out[32]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)
date
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
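Notice that all five rows now show num_sold = 329: joining on a non-unique date index matches each of a date's 18 rows in df with all 18 rows of X for that date, multiplying rows. If a positional one-to-one alignment was the intent, a sketch like this (assuming both frames share the same row order) avoids the blow-up:

df = pd.concat([df.reset_index(), X.reset_index(drop=True).drop('trend', axis=1)], axis=1)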
In [35]:

df = df.reset_index()
X = df[['date', 'num_sold']]
X['date'] = X['date'].apply(lambda x: x.toordinal())
# https://www.analyticsvidhya.com/blog/2021/05/k-means-clustering-with-mall-customer-seg
# Importing KMeans from sklearn
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)
# The elbow curve
plt.figure(figsize=(12, 6))
plt.plot(range(1, 11), wcss)
plt.plot(range(1, 11), wcss, linewidth=2, color="red", marker="8")
plt.xlabel("K Value")
plt.xticks(np.arange(1, 11, 1))
plt.ylabel("WCSS")
plt.show()
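WCSS is KMeans' inertia_, the within-cluster sum of squared distances; the elbow marks the K beyond which extra clusters stop paying for themselves. One caveat: date ordinals here are around 735,000 while num_sold is in the hundreds, so unscaled Euclidean distances are dominated almost entirely by the date. A sketch of clustering on scaled features instead (MinMaxScaler is one reasonable choice, not what this notebook does):

from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)  # both columns now in [0, 1]
km = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X_scaled)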
In [36]:

# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fitting the input data
km1.fit(X)
# Predicting the labels of the input data
y = km1.predict(X)
# Adding the labels to a column named label
df["label"] = y
# The new dataframe with the clustering done
df.head()

Out[36]:

         date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
In [38]:

# extract date features

df['year'] = pd.to_datetime(df['date']).dt.year
df['month_name'] = pd.to_datetime(df['date']).dt.month_name()
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df['day_of_week_name'] = pd.to_datetime(df['date']).dt.day_name()
df['day_of_week'] = pd.to_datetime(df['date']).dt.day_of_week
df.head()

Out[38]:

         date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4  2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...

Standardization

In [41]:

df.fillna(0, inplace=True)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']])
scaled = pd.DataFrame(scaler.transform(df[['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']]))
scaled.columns = ['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']
scaled.head()

Out[41]:

num_sold num_sold_lag1 num_sold_lag2 num_sold_lag7

0 0.066655 -1.091752 -1.091752 -1.091752

1 0.066655 -1.091752 -1.091752 -1.091752

2 0.066655 -1.091752 -1.091752 -1.091752

3 0.066655 -1.091752 -1.091752 -1.091752

4 0.066655 -1.091752 -1.091752 -1.091752
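StandardScaler z-scores each column as (x - mean) / std using statistics learned in fit. Since it was fitted on the full frame, including the test-period rows whose num_sold was filled with 0, test data leaks into the scaling statistics; the usual practice is to fit on the training period only, e.g. scaler.fit(df.loc[df['date'] < '2019-01-01', cols]) (a sketch, with cols standing for the four num_sold columns).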


In [45]:

df = df[['date', 'sin(1,freq=W-SUN)', 'cos(1,freq=W-SUN)', 'sin(2,freq=W-SUN)',
         'cos(2,freq=W-SUN)', 'cos(3,freq=W-SUN)', 'sin(4,freq=W-SUN)', 'label',
         'year', 'month_name', 'month', 'day', 'day_of_week',
         'day_of_week_name']].join(scaled, how='inner')

In [46]:

df = pd.get_dummies(df)
df.head()

Out[46]:

         date  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  sin(2,freq=W-SUN)  cos(2,freq=W-SUN)  cos(3,freq=W-SUN)  sin(4,freq=W-SUN)  ...
0  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
1  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
2  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
3  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
4  2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
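pd.get_dummies one-hot encodes the object-typed columns (month_name and day_of_week_name here) into 0/1 indicator columns and passes the numeric columns through unchanged.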

In [50]:

X_train = df[df.date < '2018-06-30'].drop(['num_sold', 'date'], axis=1)
Y_train = df[df.date < '2018-06-30']['num_sold']
X_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')].drop(['num_sold', 'date'], axis=1)
Y_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')]['num_sold']
X_test = df[df.date > '2018-12-31'].drop(['num_sold', 'date'], axis=1)
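Note the strict inequalities: rows dated exactly 2018-06-30 or 2018-12-31 land in none of the three sets. Making one side of each boundary <= would keep every row.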
In [51]:

ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

model.fit(
    X_train,
    Y_train,
    eval_metric="mape",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10)

time.time() - ts

[0] validation_0-mape:4.85086 validation_1-mape:8.26721

[1] validation_0-mape:3.42136 validation_1-mape:5.78417

[2] validation_0-mape:2.44605 validation_1-mape:4.01639

[3] validation_0-mape:1.77397 validation_1-mape:2.79364

[4] validation_0-mape:1.34831 validation_1-mape:2.01040

[5] validation_0-mape:1.10281 validation_1-mape:1.44146

[6] validation_0-mape:0.95003 validation_1-mape:1.15750

[7] validation_0-mape:0.86617 validation_1-mape:1.00791

[8] validation_0-mape:0.81581 validation_1-mape:0.91798

[9] validation_0-mape:0.79867 validation_1-mape:0.86188

[10] validation_0-mape:0.78690 validation_1-mape:0.80864

[11] validation_0-mape:0.77299 validation_1-mape:0.80242

[12] validation_0-mape:0.75649 validation_1-mape:0.80535

[13] validation_0-mape:0.75429 validation_1-mape:0.81420

[14] validation_0-mape:0.75406 validation_1-mape:0.81496

[15] validation_0-mape:0.75047 validation_1-mape:0.81933

[16] validation_0-mape:0.75265 validation_1-mape:0.82055

[17] validation_0-mape:0.75017 validation_1-mape:0.82509

[18] validation_0-mape:0.75107 validation_1-mape:0.82793

[19] validation_0-mape:0.75062 validation_1-mape:0.82923

[20] validation_0-mape:0.75051 validation_1-mape:0.82955

[21] validation_0-mape:0.75093 validation_1-mape:0.82811

Out[51]:

15.90307068824768
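Validation MAPE bottoms out at round 11 (0.80242), and with early_stopping_rounds=10 training halts at round 21, ten rounds past the best score, after roughly 16 seconds. The fitted model records the winning round, e.g.:

print(model.best_iteration, model.best_score)

and recent xgboost releases use that best iteration for predict by default when early stopping fired.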

Predictions
In [52]:

model.predict(X_test)

Out[52]:

array([0.3278924 , 0.3278924 , 0.3278924 , ..., 0.86840504, 0.86840504,

0.86840504], dtype=float32)
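Because num_sold was standardized earlier, these predictions are z-scores rather than unit sales. A minimal sketch of mapping them back, assuming scaler is the StandardScaler fitted above and num_sold was its first column:

preds = model.predict(X_test)
preds_unscaled = preds * scaler.scale_[0] + scaler.mean_[0]  # invert the z-scoring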

In [11]:

df = pd.read_excel("saaki test.xlsx")
df

Out[11]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

... ... ...

32666 109.99, Sale74.95 Kamik Gamma2 Snow Boots (For Women)

32667 $175.00 Hunter Rain Boots - Tour Neoprene

32668 175.00, Sale119.95 Blondo Steffy Snow Boots - Waterproof (For Women)

32669 $169.00 Maine Pac Boots

32670 59.99, Sale39.95 Itasca Icebreaker Snow Boots - Waterproof, Ins...

32671 rows × 2 columns

In [12]:

df.head()

Out[12]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

In [13]:

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
In [20]:

lst = df['product_title'].tolist()
# k = process.extract("Anti-Dark", lst)

In [23]:

k = process.extract("Anti-Dark", lst)

In [24]:

len(k)

Out[24]:

In [19]:

type(k)

Out[19]:

list

In [22]:

for i in k:
    print(fuzz.ratio('Anti-Matter', i))

24

17

15

18

20

10

24

14
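process.extract returns a list of (match, score) tuples, so if k still holds those tuples (the out-of-order cell numbers suggest k may have been rebuilt between runs), the loop is scoring 'Anti-Matter' against the tuples rather than the matched titles, which would explain the low ratios. Scoring against the strings themselves would look like:

for match, score in k:
    print(fuzz.ratio('Anti-Matter', match))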

