Professional Documents
Culture Documents
'NBA - Sample - CSV': Ten Rows From The Data Set
'NBA - Sample - CSV': Ten Rows From The Data Set
csv') head(data)
GAME_ID DATE HOME_TEAM AWAY_TEAM PLAYER_NAME PLAYER_ID LOCATION WIN_LOSE SHOT_NUMBER PERIOD SEC_REMAIN SHOT_CLOCK DRIBBLES TOUCH_TI
# Per-column summary statistics for the loaded data set.
summary(data)
##     GAME_ID             DATE            HOME_TEAM          AWAY_TEAM
##  Min.   :21400001   Length:50000       Length:50000       Length:50000
##  1st Qu.:21400235   Class :character   Class :character   Class :character
##  Median :21400452   Mode  :character   Mode  :character   Mode  :character
##  Mean   :21400454
##  3rd Qu.:21400677
##  Max.   :21400908
##  PLAYER_NAME
##  Length:50000
##  Class :character
## $ GAME_ID            : [type and values lost in extraction]
## $ DATE               : chr  "OCT 28, 2014" "OCT 28, 2014" "OCT 28, 2014" "OCT 28, 2014" ...
## $ HOME_TEAM          : chr  "NOP" "NOP" "NOP" "NOP" ...
## $ AWAY_TEAM          : chr  "ORL" "ORL" "ORL" "ORL" ...
## $ PLAYER_NAME        : chr  "Omer Asik" "Omer Asik" "Omer Asik" "Jrue Holiday" ...
## $ PLAYER_ID          : int  201600 201600 201600 201950 201950 201950 201950 201950 201950 201583 ...
## $ LOCATION           : chr  "H" "H" "H" "H" ...
## $ WIN_LOSE           : chr  "W" "W" "W" "W" ...
## $ SHOT_NUMBER        : int  1 7 8 2 3 4 6 7 11 1 ...
## $ PERIOD             : int  1 2 2 2 2 2 3 3 4 1 ...
## $ SEC_REMAIN         : int  607 647 361 424 331 229 595 528 181 237 ...
## $ SHOT_CLOCK         : num  11.3 22.6 23.1 12.7 11.5 14 10.7 14.4 20.2 10.1 ...
## $ DRIBBLES           : int  0 0 0 11 3 11 0 10 1 0 ...
## $ TOUCH_TIME         : num  0.8 0.3 2 10.5 4.4 9.8 0.9 9.4 1.8 1.3 ...
## $ SHOT_DIST          : num  3.6 1.2 2.3 3.6 20.6 1.3 20.9 16 4 25 ...
## $ PTS_TYPE           : int  2 2 2 2 2 2 2 2 2 3 ...
## $ CLOSEST_DEFENDER   : chr  "Nikola Vucevic" "Nikola Vucevic" "Kyle O'Quinn" "Elfrid Payton" ...
## $ CLOSEST_DEFENDER_ID: int  202696 202696 203124 203901 203901 202696 203901 203901 203932 202699 ...
## $ CLOSE_DEF_DIST     : num  1.7 3.6 2.1 2 4 3.3 6.7 2.8 5.5 4.2 ...
## $ SUCCESS            : int  1 1 1 1 0 0 0 1 1 0 ...
# Correlation matrix over the numeric columns of `data` only.
# vapply() guarantees a logical mask regardless of the number of columns.
M <- cor(data[vapply(data, is.numeric, logical(1))])
M

# Missing-value overview of the data set (DataExplorer).
library(DataExplorer)
plot_missing(data)

# Visualise the correlation matrix computed above (corrplot).
library(corrplot)
corrplot(M)

library(PerformanceAnalytics)
##
## Attaching package: 'zoo'
##
## Attaching package: 'PerformanceAnalytics'
# Treat the shot outcome as a categorical variable rather than an integer.
data$SUCCESS <- as.factor(data$SUCCESS)

library(ggplot2)
library(readr)
library(repr)

# Plot width/height options read by the repr package when rendering figures.
options(repr.plot.width = 6, repr.plot.height = 3.5)
##
## Attaching package: 'dplyr'
# Distribution of the CLOSE_DEF_DIST column.
summary(data$CLOSE_DEF_DIST)

# TOUCH_TIME
library(dplyr)
summary(data$TOUCH_TIME)

# Plot width/height options read by the repr package when rendering figures.
options(repr.plot.width = 6, repr.plot.height = 3.5)
# Logistic regression of shot outcome (SUCCESS) on location, shot number,
# period, player id, shot distance and closest-defender distance.
model <- glm(
  SUCCESS ~ LOCATION + SHOT_NUMBER + PERIOD + PLAYER_ID +
    SHOT_DIST + CLOSE_DEF_DIST,
  data = data,
  family = "binomial"
)
summary(model)
##
## Call:
## glm(formula = SUCCESS ~ LOCATION + SHOT_NUMBER + PERIOD + PLAYER_ID +
##     SHOT_DIST + CLOSE_DEF_DIST, family = "binomial", data = data)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.0455  -1.0934  -0.8205   1.1522   2.0879
##
## Coefficients:
The location of the team has only a very slight impact on the result of the shot. From the summary, it is evident that LOCATION is not a
significant predictor of the result.
# Grouped bar chart of WIN_LOSE counts per HOME_TEAM, followed by the
# underlying contingency table. `legend.text` is spelled out in full instead
# of relying on partial matching, and TRUE replaces the reassignable `T`.
barplot(
  table(data$WIN_LOSE, data$HOME_TEAM),
  beside = TRUE,
  legend.text = c("Lose", "Win"),
  col = c("#3C6688", "#45A778"),
  border = "white",
  las = 2,
  main = "Team Analysis"
)
table(data$WIN_LOSE, data$HOME_TEAM)
##
##     ATL BKN BOS CHA CHI CLE DAL DEN DET GSW HOU IND LAC LAL MEM
##   L 868 753 928 849 931 883 850 838 881 743 857 722 901 808 906
##   W 928 771 889 921 918 865 903 896 863 797 821 814 965 699 818
##
##     MIA MIL MIN NOP NYK OKC ORL PHI PHX POR  SAC SAS TOR UTA WAS
##   L 752 666 816 847 733 737 797 667 897 903 1009 734 893 771 891
##   W 715 709 809 802 737 701 839 791 868 904 1030 763 877 790 966
SAC has won the most matches at home, followed by WAS and LAC.
# Logistic regression of the match result (WIN_LOSE) on location, seconds
# remaining and player id.
# NOTE(review): WIN_LOSE is listed as character in the structure output above;
# presumably it is converted to a factor when the model frame is built —
# confirm which level is treated as "success".
remodel <- glm(
  WIN_LOSE ~ LOCATION + SEC_REMAIN + PLAYER_ID,
  data = data,
  family = "binomial"
)
summary(remodel)
##
## Call:
## glm(formula = WIN_LOSE ~ LOCATION + SEC_REMAIN + PLAYER_ID, family = "binomial",
##     data = data)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.4095  -1.1645   0.9642   1.1070   1.3282
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.673e-02
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 69312  on 49999  degrees of freedom
## Residual deviance: 68382  on 49996  degrees of freedom
## AIC: 68390
##
## Number of Fisher Scoring iterations: 4
LOCATION and PLAYER_ID are highly statistically significant, and SEC_REMAIN is also significant, in predicting the result of the match, as their p-values are less than 0.05.