# You are on page 1 of 2

import pandas as pd

import os
import numpy as np
import seaborn as sns
import matplotlib as plt

# Switch to the dataset directory and load the Bollywood CSV into a DataFrame.
os.getcwd()
os.chdir(r"E:\PFA\Datasets")
bollywood = pd.read_csv("bollywood.csv")

# First look at the data: leading rows, column labels, row index, dtypes,
# per-column count of missing values, and summary statistics.
bollywood.iloc[:10]
bollywood.columns
bollywood.index
bollywood.dtypes
bollywood.isna().sum(axis=0)
bollywood.describe()

# Use the movie title as the row index.
bollywood.index = bollywood['MovieName']
# Preview the frame without the title column.  The result is not assigned, so
# `bollywood` itself keeps 'MovieName' -- the ranking queries further down
# still select it as a column.
# `axis` is keyword-only in pandas >= 2.0, where the positional form
# drop('MovieName', 1) raises TypeError; `columns=` works on old and new
# versions alike.
bollywood.drop(columns='MovieName')
bollywood.columns

# General exploration: top-5 rankings, overall and within the Action genre.
# Every ranking reports the same three columns.
_ranking_cols = ['MovieName', 'BoxOfficeCollection', 'Budget']

# Five highest-grossing films
(bollywood
 .sort_values(by='BoxOfficeCollection', ascending=False)[_ranking_cols]
 .head(5))

# Five highest-budget films
(bollywood
 .sort_values(by='Budget', ascending=False)[_ranking_cols]
 .head(5))

# Restrict to the Action genre for the next two rankings.
_action_films = bollywood[bollywood['Genre'] == 'Action']

# Five highest-grossing Action films
(_action_films
 .sort_values(by='BoxOfficeCollection', ascending=False)[_ranking_cols]
 .head(5))

# Five highest-budget Action films
(_action_films
 .sort_values(by='Budget', ascending=False)[_ranking_cols]
 .head(5))

# How many movies are there in each genre?

# Distinct genre labels, in order of first appearance.
bollywood['Genre'].unique()
# Number of films per genre, most frequent first.
bollywood['Genre'].value_counts()

# Genre by ReleaseTime cross-tabulation

# Distinct release-time labels, then how many films fall under each.
bollywood['ReleaseTime'].unique()
bollywood['ReleaseTime'].value_counts()
# Film counts with one row per genre and one column per release time.
pd.crosstab(index=bollywood['Genre'], columns=bollywood['ReleaseTime'])

# Cross-tabulate month of release vs. big budget.
# Start by checking the date format on a single record.

import datetime as dt

# The frame is indexed by movie name, so a bare integer key on the column
# relies on pandas' positional fallback, which is deprecated in pandas 2.1+.
# `.iloc[1]` selects the same (second) record explicitly by position.
date_time_str = str(bollywood['Release Date'].iloc[1])
# Release dates are parsed with the pattern day-abbreviated month-2-digit year.
date_time_obj = dt.datetime.strptime(date_time_str, '%d-%b-%y')
date_time_obj.month

# First create the ReleaseMonth and BigBudget columns.

# Parse the whole 'Release Date' column in one call (same '%d-%b-%y' pattern
# as the single-record check) instead of running strptime row by row, then
# keep only the month number.
bollywood['ReleaseMonth'] = pd.to_datetime(
    bollywood['Release Date'], format='%d-%b-%y'
).dt.month
bollywood['ReleaseMonth'].value_counts(sort=True)
# A film is flagged as big budget when its Budget exceeds 25; a column-wise
# comparison yields the same booleans as the per-row lambda.
bollywood['BigBudget'] = bollywood['Budget'] > 25

# Cross-tabulate ReleaseMonth against BigBudget.
bdf = pd.crosstab(bollywood['ReleaseMonth'], bollywood['BigBudget']).reset_index()
# After reset_index the columns are the month followed by the False and True
# counts (in that order); give them readable names.
bdf.columns = ['ReleaseMonth', 'BBFalse', 'BB']
bdf
# Keep only the big-budget counts.  `axis` is keyword-only in pandas >= 2.0,
# so the positional form drop('BBFalse', 1, ...) raises TypeError there.
bdf.drop(columns='BBFalse', inplace=True)
# Months ranked by number of big-budget releases.
bdf.sort_values(by='BB', axis=0, ascending=False)

# Create the ROI column and sort by ROI.

# ROI = (BoxOfficeCollection - Budget) / Budget, computed with column
# arithmetic rather than a Python-level lambda applied to every row.
bollywood['ROI'] = (
    (bollywood['BoxOfficeCollection'] - bollywood['Budget']) / bollywood['Budget']
)
# Films ranked from highest to lowest return on investment.
bollywood.sort_values(by='ROI', axis=0, ascending=False)

# Compute the average ROI for each ReleaseTime category.

# Use the built-in groupby mean: passing the callable np.mean to agg() emits a
# FutureWarning on pandas >= 2.1, and .mean() is the reduction pandas
# currently substitutes for it anyway.
bollywood.groupby('ReleaseTime')['ROI'].mean()

# You might also like