1.1 Read the data and do exploratory data analysis. Describe the data briefly. (Check the null values, data types, shape, EDA). Perform Univariate and Bivariate Analysis. (8 marks)
In [1]:
### Loading the necessary libraries for the model
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
df = pd.read_csv('Firm_level_data (1) (1) (1).csv')
In [3]:
df.head()
Out[3]: first five rows (columns: Unnamed: 0, sales, capital, patents, randd, employment, sp500, tobinq, value, institutions; table truncated in export)
In [4]:
df.tail()
Out[4]: last five rows (same columns; table truncated in export)
In [5]:
df.shape
Out[5]: (759, 10)
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 759 entries, 0 to 758
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Unnamed: 0    759 non-null    int64
 1   sales         759 non-null    float64
 2   capital       759 non-null    float64
 3   patents       759 non-null    int64
 4   randd         759 non-null    float64
 5   employment    759 non-null    float64
 6   sp500         759 non-null    object
 7   tobinq        738 non-null    float64
 8   value         759 non-null    float64
 9   institutions  759 non-null    float64
dtypes: float64(7), int64(2), object(1)
In [7]:
### Data Description
df.describe(include = 'all').T
Out[7]:
              count  unique  top  freq  mean         std          min       25%         50%
Unnamed: 0    759.0  NaN     NaN  NaN   379.0        219.248717   0.0       189.5       379.0
sales         759.0  NaN     NaN  NaN   2689.705158  8722.060124  0.138     122.92      448.577082
capital       759.0  NaN     NaN  NaN   1977.747498  6466.704896  0.057     52.650501   202.179023
patents       759.0  NaN     NaN  NaN   25.831357    97.259577    0.0       1.0         3.0
randd         759.0  NaN     NaN  NaN   439.938074   2007.397588  0.0       4.628262    36.864136
employment    759.0  NaN     NaN  NaN   14.164519    43.321443    0.006     0.9275      2.924
sp500         759    2       no   542   NaN          NaN          NaN       NaN         NaN
tobinq        738.0  NaN     NaN  NaN   2.79491      3.366591     0.119001  1.018783    1.680303
value         759.0  NaN     NaN  NaN   2732.73475   7071.072362  1.971053  103.593946  410.793529
institutions  759.0  NaN     NaN  NaN   43.02054     21.685586    0.0       25.395      44.11
(75% and max columns truncated in export)
In [8]:
### Null value check
df.isnull().sum()
Out[8]: Unnamed: 0 0
sales 0
capital 0
patents 0
randd 0
employment 0
sp500 0
tobinq 21
value 0
institutions 0
dtype: int64
In [9]:
# Percentage of missing values (code reconstructed; the Unnamed: 0 row of the output was lost in export)
df.isnull().sum() / len(df) * 100
sales 0.000000
capital 0.000000
patents 0.000000
randd 0.000000
employment 0.000000
sp500 0.000000
tobinq 2.766798
value 0.000000
institutions 0.000000
dtype: float64
In [10]:
df2=df.drop(['Unnamed: 0'], axis = 1)
In [11]:
df2.head()
In [12]:
### check for duplicates in data
dups = df2.duplicated()
print('Number of duplicate rows =', dups.sum())
In [13]:
### unique values for categorical variables
for column in df2.columns:
    if df2[column].dtype == 'object':
        print(column.upper(), ': ', df2[column].nunique())
        print(df2[column].value_counts().sort_values())
        print('\n')
SP500 : 2
yes 217
no 542
In [14]:
# Treating null values: impute tobinq with its mean
mean_value = df2['tobinq'].mean()
df2['tobinq'] = df2['tobinq'].fillna(mean_value)
print('Updated Dataframe:')
print(df2)
Updated Dataframe:
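For reference, the same mean imputation can be expressed with scikit-learn's SimpleImputer; a minimal sketch of the equivalent step:

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')                  # same statistic as df2['tobinq'].mean()
df2[['tobinq']] = imputer.fit_transform(df2[['tobinq']])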
In [15]:
df2.isnull().sum()
Out[15]: sales 0
capital 0
patents 0
randd 0
employment 0
sp500 0
tobinq 0
value 0
institutions 0
dtype: int64
In [16]:
df2.describe()
In [17]:
## Univariate distribution plots
# (figure/axes setup reconstructed; right-hand column indices assumed and
# duplicated title lines from the export removed)
plt.style.use('seaborn-whitegrid')
fig, axes = plt.subplots(4, 2)
fig.set_size_inches(12, 20)
a = sns.distplot(df2['sales'], ax=axes[0][0])
a.set_title("sales Distribution", fontsize=10)
a = sns.distplot(df2['patents'], ax=axes[1][0])
a.set_title("patents Distribution", fontsize=10)
a = sns.distplot(df2['capital'], ax=axes[2][0])
a.set_title("capital Distribution", fontsize=10)
a = sns.distplot(df2['randd'], ax=axes[3][0])
a.set_title("randd Distribution", fontsize=10)
a = sns.distplot(df2['employment'], ax=axes[0][1])
a.set_title("employment Distribution", fontsize=10)
a = sns.distplot(df2['tobinq'], ax=axes[1][1])
a.set_title("tobinq Distribution", fontsize=10)
a = sns.distplot(df2['value'], ax=axes[2][1])
a.set_title("value Distribution", fontsize=10)
plt.show()
In [18]:
def plot_distribution(df, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    # Body reconstructed (original lost in export): plot the distribution of
    # each numeric column in a grid of subplots.
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(hspace=hspace, wspace=wspace)
    num_cols = df.select_dtypes(include=np.number).columns
    rows = int(np.ceil(len(num_cols) / cols))
    for i, column in enumerate(num_cols):
        ax = fig.add_subplot(rows, cols, i + 1)
        sns.distplot(df[column], ax=ax)
        ax.set_title(column + " Distribution", fontsize=10)
In [19]:
df2.columns
Out[19]: Index(['sales', 'capital', 'patents', 'randd', 'employment', 'sp500', 'tobinq',
       'value', 'institutions'],
      dtype='object')
In [20]:
df2.skew()
Out[20]: (sales row truncated in export)
capital 7.555091
patents 7.766943
randd 10.270483
employment 9.068875
tobinq 3.332006
value 6.075996
institutions -0.168071
dtype: float64
In [21]:
### Data Distribution
sns.pairplot(df2, diag_kind='kde')
plt.show()
In [22]:
## Data Distribution
In [23]:
### checking for Correlations
df_cor = df2.corr()
plt.figure(figsize=(8,6))
sns.heatmap(df_cor, annot=True)   # heatmap call assumed; the plotting line was lost in export
Out[23]: <AxesSubplot:>
1.2 Impute null values if present? Do you think scaling is necessary in this case? (8 marks)
In [24]:
df2.isnull().sum()
Out[24]: sales 0
capital 0
patents 0
randd 0
employment 0
sp500 0
tobinq 0
value 0
institutions 0
dtype: int64
In [25]:
### locate zero values in the data
df2[df2.isin([0])].stack()
Out[25]:
6 patents 0.0
7 randd 0.0
18 patents 0.0
22 patents 0.0
...
randd 0.0
In [26]:
for column in df2.columns:
    if df2[column].dtype != 'object':
        median = df2[column].median()
        df2[column] = df2[column].fillna(median)

# Note: this checks the original df, which was never imputed; hence tobinq still
# shows 21 nulls below, while df2 itself is already clean.
df.isnull().sum()
Out[26]: Unnamed: 0 0
sales 0
capital 0
patents 0
randd 0
employment 0
sp500 0
tobinq 21
value 0
institutions 0
dtype: int64
In [27]:
df2.dtypes
Out[27]: sales float64
capital float64
patents int64
randd float64
employment float64
sp500 object
tobinq float64
value float64
institutions float64
dtype: object
In [28]:
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#num_d = df2.select_dtypes(exclude=['object'])
#df2[num_d.columns] = sc.fit_transform(num_d)
# Scaling is left commented out: plain linear regression's fit and R^2 are
# unaffected by scaling, though it would make coefficient magnitudes comparable.
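A minimal toy check of that claim (synthetic data, not this dataset): R^2 is unchanged when the predictors are standardized.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(100, 3)) * [1.0, 100.0, 1e4]    # wildly different scales
y_toy = X_toy @ np.array([2.0, 0.03, 1e-4]) + rng.normal(size=100)
X_std = StandardScaler().fit_transform(X_toy)
r2_raw = LinearRegression().fit(X_toy, y_toy).score(X_toy, y_toy)
r2_std = LinearRegression().fit(X_std, y_toy).score(X_std, y_toy)
print(round(r2_raw, 10) == round(r2_std, 10))            # True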
In [29]:
df2.head()
In [30]:
#outlier check
In [31]:
df2.isnull().sum()
Out[31]: sales 0
capital 0
patents 0
randd 0
employment 0
sp500 0
tobinq 0
value 0
institutions 0
dtype: int64
In [32]:
cols = ['sales', 'capital', 'patents', 'randd', 'employment', 'tobinq', 'value', 'institutions']
for i in cols:
    sns.boxplot(df2[i])
    plt.show()
In [33]:
cont=df2.dtypes[(df2.dtypes!='uint8') & (df2.dtypes!='object')].index
In [34]:
def remove_outlier(col):
    # 1.5 * IQR rule; the lower/upper bounds and return were lost in export and
    # are reconstructed to match the lr, ur unpacking used below
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
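A quick sanity check of the helper on a toy array (illustrative values only):

lr, ur = remove_outlier(np.array([1, 2, 3, 4, 100]))
print(lr, ur)   # Q1=2, Q3=4, IQR=2, so lower=-1.0 and upper=7.0; 100 would be capped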
In [35]:
for column in df2[cont].columns:
    lr, ur = remove_outlier(df2[column])
    df2[column] = np.where(df2[column] > ur, ur, df2[column])
    df2[column] = np.where(df2[column] < lr, lr, df2[column])
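An equivalent, slightly more compact form of the same capping using pandas' clip (same behavior, shown only as an alternative):

for column in df2[cont].columns:
    lr, ur = remove_outlier(df2[column])
    df2[column] = df2[column].clip(lower=lr, upper=ur)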
In [36]:
cols = ['sales', 'capital', 'patents', 'randd', 'employment', 'tobinq', 'value', 'institutions']
for i in cols:
    sns.boxplot(df2[i])
    plt.show()
In [37]:
df2_cor = df2.corr()
plt.figure(figsize=(8,6))
sns.heatmap(df2_cor, annot=True)   # heatmap call assumed; the plotting line was lost in export
Out[37]: <AxesSubplot:>
1.3 Encode the data (having string values) for Modelling. Data Split: Split the data into train and test (70:30). Apply Linear regression. Performance Metrics: Check the performance of Predictions on Train and Test sets using Rsquare, RMSE. (8 marks)
In [38]:
### Converting categorical to dummy variables in data
data = pd.get_dummies(df2, columns=['sp500'])
data.head()
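One design note: keeping both dummies (sp500_no and sp500_yes) makes the design matrix perfectly collinear, which is what the "smallest eigenvalue" warning in the statsmodels summaries further below points to. A sketch of the usual remedy (not applied in this notebook):

# drop_first=True keeps only sp500_yes and removes the redundant column
data_alt = pd.get_dummies(df2, columns=['sp500'], drop_first=True)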
In [39]:
data.columns
Out[39]: Index(['sales', 'capital', 'patents', 'randd', 'employment', 'tobinq', 'value',
       'institutions', 'sp500_no', 'sp500_yes'],
      dtype='object')
In [40]:
#### Train/test split. The unrequired column is already dropped; renaming for convenience
data_model = data
data_model.columns
Out[40]: Index(['sales', 'capital', 'patents', 'randd', 'employment', 'tobinq', 'value',
       'institutions', 'sp500_no', 'sp500_yes'],
      dtype='object')
In [41]:
data_model.head()
In [42]:
data_model.info()
<class 'pandas.core.frame.DataFrame'>
(remaining info() output truncated in export)
In [43]:
# Copy all the predictor variables into X dataframe
X = data_model.drop('sales', axis=1)
y = data_model[['sales']]
In [44]:
X.head()
In [45]:
X.shape
Out[45]: (759, 9)
In [46]:
y.head()
Out[46]: sales
0 826.995050
1 407.753973
2 4371.988416
3 451.000010
4 174.927981
In [47]:
y.shape
Out[47]: (759, 1)
In [48]:
# Split X and y into training and test sets in a 70:30 ratio
# (code reconstructed; the random_state used originally is not shown)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
In [49]:
## Linear Regression Model
# invoke the LinearRegression function and find the best-fit model on training data
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[49]: LinearRegression()
In [50]:
# Let us explore the coefficients for each of the independent attributes
# (loop reconstructed; the original code and output were lost in export)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
In [51]:
# Let us check the intercept for the model
intercept = regression_model.intercept_[0]
In [52]:
# R square on training data
regression_model.score(X_train, y_train)
Out[52]: 0.9358806629736066
In [53]:
# R square on testing data
regression_model.score(X_test, y_test)
Out[53]: 0.924129439335239
In [54]:
# RMSE on training data (the model is already fitted on X_train above)
from sklearn import metrics
predicted_train = regression_model.predict(X_train)
np.sqrt(metrics.mean_squared_error(y_train, predicted_train))
Out[54]: 394.6129494572075
In [71]:
# RMSE on testing data; close to the training RMSE (~394.6 vs ~399.7),
# so the model does not appear to overfit
predicted_test = regression_model.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, predicted_test))
Out[71]: 399.74321332112794
In [55]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [56]:
# VIF for each predictor (loop body reconstructed; original lost in export)
for i in range(X.shape[1]):
    print(X.columns[i], '--->', variance_inflation_factor(X.values, i))
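The same values read more easily as a sorted Series; a small sketch (the usual rule of thumb flags VIFs above roughly 5-10 as multicollinearity concerns):

vif = pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
                index=X.columns)
print(vif.sort_values(ascending=False))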
In [57]:
### Using the statsmodels library
# data_train is assembled the same way as data_test later on (reconstructed;
# the original cell that built it was lost in export)
data_train = pd.concat([X_train, y_train], axis=1)
data_train.head()
In [58]:
data_train.columns
Out[58]: Index(['capital', 'patents', 'randd', 'employment', 'tobinq', 'value',
       'institutions', 'sp500_no', 'sp500_yes', 'sales'],
      dtype='object')
In [59]:
# OLS fit via statsmodels (cell reconstructed from context; the formula is
# inferred from the coefficient listing below, whose Intercept row was lost in export)
import statsmodels.formula.api as smf
expr = 'sales ~ capital + patents + randd + employment + tobinq + value + institutions + sp500_no + sp500_yes'
lm1 = smf.ols(formula=expr, data=data_train).fit()
lm1.params
Out[59]: capital 0.406154
patents -4.647327
randd 0.639885
employment 78.613725
tobinq -39.925789
value 0.244625
institutions 0.217439
sp500_no -31.100320
sp500_yes 135.031767
dtype: float64
In [60]:
print(lm1.summary()) #Inferential statistics
(regression summary table truncated in export; Df Model: 8)
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.53e-30. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
In [61]:
# Compute the mean squared error by predicting y for the test cases and
# subtracting the predictions from the actual y
mse = np.mean((regression_model.predict(X_test) - y_test) ** 2)
In [62]:
# The square root of the MSE is the RMSE, i.e. the typical deviation between
# predicted and actual values
import math
math.sqrt(mse)
Out[62]: 399.743213321128
In [63]:
# Model score: R^2, the coefficient of determination
# R^2 = 1 - (RSS / TSS)
regression_model.score(X_test, y_test)
Out[63]: 0.924129439335239
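As a cross-check of the formula in the comment above, R^2 can be recomputed by hand from RSS and TSS; the result should match the score output up to floating-point error:

rss = float(((y_test.values - regression_model.predict(X_test)) ** 2).sum())
tss = float(((y_test.values - y_test.values.mean()) ** 2).sum())
print(1 - rss / tss)   # ~0.9241, matching regression_model.score(X_test, y_test)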
In [64]:
# predict sales for the observations in the test set
y_pred = regression_model.predict(X_test)
In [65]:
plt.scatter(y_test['sales'], y_pred)
In [66]:
### ITERATION 2
# lm2 fit reconstructed (original code lost in export); the coefficients below
# are identical to lm1's, so this iteration kept the same predictors
lm2 = smf.ols(formula=expr, data=data_train).fit()
lm2.params
Out[66]: capital 0.406154
patents -4.647327
randd 0.639885
employment 78.613725
tobinq -39.925789
value 0.244625
institutions 0.217439
sp500_no -31.100320
sp500_yes 135.031767
dtype: float64
In [67]:
print(lm2.summary()) #Inferential statistics
(regression summary table truncated in export; Df Model: 8)
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.53e-30. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
In [68]:
# concatenate X and y into a single dataframe
data_test=pd.concat([X_test,y_test],axis=1)
data_train.head()
In [69]:
data_test.head()
In [70]:
# Write out the fitted regression equation (print statements reconstructed;
# the original loop body was lost in export)
print('sales =', end=' ')
for i, j in np.array(lm2.params.reset_index()):
    print('({}) * {} +'.format(round(j, 2), i), end=' ')