In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math
import datetime
sns.set()
sns.set_style('whitegrid')
# plt.style.use("dark_background")
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
from xgboost import XGBRegressor
from xgboost import plot_importance
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from scipy.signal import periodogram


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time
from itertools import product
%matplotlib inline
import os
import matplotlib.style as style
import matplotlib.gridspec as gridspec
import scipy.stats as stats
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
In [2]:
Out[2]:
40
In [3]:
Out[3]:
50
In [6]:
a, b, c = 2, 4, 8
print(b/a + c)    # 4/2 + 8 -> 10.0
print(b/(a + c))  # 4/(2 + 8) -> 0.4
10.0
0.4
In [ ]:
In [4]:
set(l2) - set(l1)   # words present in l2 but not in l1
Out[4]:
{'accurate',
'actions',
'add',
'aged',
'ages',
'agree',
'akin',
'allow',
'amaze',
'angel'}
In [2]:
In [12]:
test['num_sold'] = 0
test.head()
Out[12]:
In [17]:
matrix = pd.concat([train[['date','num_sold']], test[['date','num_sold']]], ignore_index=True)
Lag Features
In [19]:
df = matrix.reset_index()
df = df.set_index(['date'])
entries_perday = len(df.loc['2015-01-08'])
df = df.reset_index().set_index('date')
df.drop('index', axis=1, inplace=True)
print('Number of entries in a day across all stores : {}'.format(entries_perday))
df.head()
Out[19]:
            num_sold
date
2015-01-01       329
2015-01-01       520
2015-01-01       146
2015-01-01       572
2015-01-01       911
In [21]:
Out[21]:
(preview of the dataframe after adding lag features; the column headers were truncated in the export — the later Out[32] shows num_sold_lag1, num_sold_lag2 and num_sold_lag7 — and every lag value for the first 2015-01-01 rows (num_sold 329, 520, 146, 572, 911) is NaN)
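The code for this lag-feature cell did not survive the export; a minimal sketch of how lags like num_sold_lag1/2/7 are typically built for this row layout (the assumption being that each lag looks back whole days, i.e. multiples of entries_perday rows, which only works if the within-day ordering of store/product rows is consistent):

# sketch only: recreate the lag columns shown in the preview above
for lag in [1, 2, 7]:
    df[f'num_sold_lag{lag}'] = df['num_sold'].shift(lag * entries_perday)
df.head()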
In [22]:
Scatter Plot
In [24]:
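The code for this cell was lost in the export; a minimal sketch of a scatter plot of daily sales over time (column names taken from the cells above; whether 'date' is already a datetime is an assumption):

# sketch only: num_sold against date
fig, ax = plt.subplots(figsize=(16, 5))
ax.scatter(pd.to_datetime(train['date']), train['num_sold'], s=3, alpha=0.3)
ax.set_xlabel('date')
ax.set_ylabel('num_sold')
ax.set_title('Scatter Plot')
plt.show()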
In [25]:
# Creating a periodogram.

fs = pd.Timedelta("1Y") / pd.Timedelta("1D")   # sampling frequency: ~365.25 observations per year
frequencies, spectrum = periodogram(
    train['num_sold'],
    fs=fs,
    detrend='linear',
    window="boxcar",
    scaling='spectrum',
)
fig, ax = plt.subplots(figsize=(16, 5))
ax.step(frequencies, spectrum, color="purple")
ax.set_xscale("log")
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
    [
        "Annual (1)",
        "Semiannual (2)",
        "Quarterly (4)",
        "Bimonthly (6)",
        "Monthly (12)",
        "Biweekly (26)",
        "Weekly (52)",
        "Semiweekly (104)",
    ],
    rotation=90,
)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
ax.set_ylabel("Variance")
ax.set_title("Periodogram", fontsize=18);
In [29]:
# choosing order = 4 because semiweekly, biweekly, and weekly periodicity was observed
fourier = CalendarFourier(freq="W", order=4)
data = df.reset_index().set_index([pd.DatetimeIndex(df.reset_index()['date'])])
y = data['num_sold']
dp = DeterministicProcess(
    index=y.index,
    order=1,
    seasonal=False,
    constant=False,
    additional_terms=[fourier],
    drop=True,
)

X = dp.in_sample()
X.shape
Out[29]:
(32868, 7)
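dp.in_sample() only produces the trend and Fourier terms for the dates in y.index; if the forecast horizon extended beyond them, the matching terms would come from out_of_sample. A minimal sketch (the step count is an assumption):

# sketch only: deterministic terms for rows beyond the training index
X_future = dp.out_of_sample(steps=len(test))
X_future.head()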
In [31]:
df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index().set_index('date')
df.head()
Out[31]:
In [32]:
df = df.drop('index', axis=1).join(X.drop('trend', axis=1), how='inner')
df.head()
Out[32]:
            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)
date
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969
In [35]:
df = df.reset_index()
X = df[['date','num_sold']]
X['date'] = X['date'].apply(lambda x: x.toordinal())
# https://www.analyticsvidhya.com/blog/2021/05/k-means-clustering-with-mall-customer-seg
# Importing KMeans from sklearn
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)
# The elbow curve
plt.figure(figsize=(12, 6))
plt.plot(range(1, 11), wcss)
plt.plot(range(1, 11), wcss, linewidth=2, color="red", marker="8")
plt.xlabel("K Value")
plt.xticks(np.arange(1, 11, 1))
plt.ylabel("WCSS")
plt.show()
In [36]:
# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fitting the input data
km1.fit(X)
# Predicting the labels of the input data
y = km1.predict(X)
# Adding the labels to a column named label
df["label"] = y
# The new dataframe with the clustering done
df.head()
Out[36]:
        date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
In [38]:
Out[38]:
        date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
Standardization
In [41]:
df.fillna(0, inplace=True)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['num_sold','num_sold_lag1','num_sold_lag2','num_sold_lag7']])
scaled = pd.DataFrame(scaler.transform(df[['num_sold','num_sold_lag1','num_sold_lag2','num_sold_lag7']]))
scaled.columns = ['num_sold','num_sold_lag1','num_sold_lag2','num_sold_lag7']
scaled.head()
Out[41]:
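The surviving cells never write `scaled` back into `df`; if the intent was to continue with standardized values, one possible (assumed) step is:

# sketch only (assumption): overwrite the raw columns with their standardized versions
cols = ['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']
df[cols] = scaled[cols].values
# note: predictions made on a scaled target would need scaler.inverse_transform to return to units sold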
In [46]:
df = pd.get_dummies(df)
df.head()
Out[46]:
(wide preview of the dummy-encoded dataframe; the column headers were truncated in the export — the first 2015-01-01 rows repeat the Fourier-term values 0.433884, -0.900969, -0.781831, 0.62349, -0.222521, -0.974928)
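The cells that define X_train, Y_train, X_valid, Y_valid and X_test for the next step did not survive the export; a hedged sketch of a chronological split consistent with the columns above (the cutoff date and feature list are assumptions):

# sketch only: time-ordered split so validation follows training in time
feature_cols = [c for c in df.columns if c not in ['date', 'num_sold']]
cutoff = '2018-01-01'                       # assumed validation cutoff
train_mask = df['date'] < cutoff
valid_mask = df['date'] >= cutoff
X_train, Y_train = df.loc[train_mask, feature_cols], df.loc[train_mask, 'num_sold']
X_valid, Y_valid = df.loc[valid_mask, feature_cols], df.loc[valid_mask, 'num_sold']
# note: rows that came from the test file (num_sold filled with 0) would need to be
# excluded from training; X_test would be the feature rows for the test-period dates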
In [50]:
ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

model.fit(
    X_train,
    Y_train,
    eval_metric="mape",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10)

time.time() - ts
Out[51]:
15.90307068824768
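The plot_features helper defined in the first cell can be applied to the fitted booster to inspect which features it relied on; a minimal usage sketch (the figure size is arbitrary):

plot_features(model, (12, 8))
plt.show()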
Predictions
In [52]:
model.predict(X_test)
Out[52]:
array([ ..., 0.86840504], dtype=float32)
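For a Kaggle-style submission these predictions would typically be paired with the test identifiers and written to CSV; a hedged sketch (the row_id column name and output file are assumptions, not shown in the original):

# sketch only: assumes the test frame carries a 'row_id' identifier column
preds = model.predict(X_test)
submission = pd.DataFrame({'row_id': test['row_id'], 'num_sold': preds})
submission.to_csv('submission.csv', index=False)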
In [11]:
Out[11]:
product_price product_title
32668 175.00, Sale119.95 Blondo Steffy Snow Boots - Waterproof (For Women)
In [12]:
df.head()
Out[12]:
product_price product_title
In [13]:
lst = df['product_title'].tolist()
# k = process.extract("Anti-Dark", lst)
In [23]:
from fuzzywuzzy import process  # import assumed; could equally be `from thefuzz import process`
k = process.extract("Anti-Dark", lst)
In [24]:
len(k)
Out[24]:
In [19]:
type(k)
Out[19]:
list
In [22]:
from fuzzywuzzy import fuzz  # import assumed; could equally be `from thefuzz import fuzz`
for i in k:
    print(fuzz.ratio('Anti-Matter', i))
24
17
15
18
20
10
24
14
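process.extract returns (match, score) pairs, so each i in the loop above is a tuple rather than a title string; a small sketch of unpacking the pairs before scoring:

# sketch only: unpack the (title, score) pairs returned by process.extract
for title, score in k:
    print(score, title, '->', fuzz.ratio('Anti-Matter', title))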
In [ ]: