Metrics
> # 2017
> trad17 <- read.csv("trad17.csv", stringsAsFactors = F)
> # 2018
> trad18 <- read.csv("trad18.csv", stringsAsFactors = F)
> # VISUALIZATIONS
> #install.packages("gridExtra")
> library(gridExtra)
> library(grid)
> pdf("Report.pdf")
> View(Adv18)
> # Fix an error in the Adv18 dataset: observation #534 has an error in PIE (-400 should be -4)
> Adv18$PIE[534] <- -4
> names(nba2017_1)
[1] "Year" "Rk.x" "Player" "Team" "Age"
[6] "GP" "W" "L" "Min" "Pts"
> names(nba2017_2)
[1] "Year" "Rk.x" "Player" "Team"
[53] "DEF_WS"
> names(nba2017_3)
[1] "Year" "Rk.x" "Player" "Team"
[65] "avg_speed_def"
> names(nba2017_4)
[1] "Year" "Rk.x" "Player" "Team"
> names(nba2018_1)
[1] "Year" "Rk.x" "Player" "Team" "Age"
> names(nba2018_2)
[1] "Year" "Rk.x" "Player" "Team"
[53] "DEF_WS"
> names(nba2018_3)
[1] "Year" "Rk.x" "Player" "Team"
[65] "avg_speed_def"
> names(nba2018_4)
[1] "Year" "Rk.x" "Player" "Team"
> names(nba2018_4)
[1] "Year" "Player" "Team" "Age"
> names(nba2017)
[1] "Year" "Player" "Team.x" "Age"
> nba2017$salary2017_18[277]<-2093040
> nba2017$salary2017_18[309]<-2422560
> # Function to remove rows that still have NAs in a specific column (here, the salary column)
> completeFun <- function(data, desiredCols) {
+ comp .... [TRUNCATED]
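The body of completeFun is truncated above. A minimal sketch of a function matching the comment (keep only rows that are complete in the desired columns), assuming base R's complete.cases():

completeFun <- function(data, desiredCols) {
  # Keep only rows with no NA in the requested column(s)
  completeVec <- complete.cases(data[, desiredCols])
  data[completeVec, ]
}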
> # Remove players with missing salary. Most of the remaining empty salary2017_18 values are due to the player not getting a deal.
> nba2017 <- co .... [TRUNCATED]
> str(nba2017)
'data.frame': 442 obs. of 73 variables:
$ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
$ Player : chr "Russell Westbrook" "James Harden" "Isaiah Thomas" "Anthony Davis" ...
$ Team.x : chr "OKC" "HOU" "BOS" "NOP" ...
$ Age : int 28 27 28 24 27 26 26 32 26 29 ...
$ GP.x : int 81 81 76 75 74 75 72 74 74 79 ...
$ W.x : int 46 54 51 31 47 38 30 51 54 65 ...
$ L.x : int 35 27 25 44 27 37 42 23 20 14 ...
$ Min.x : num 34.6 36.4 33.8 36.1 35.4 35.9 34.2 37.8 33.4 33.4 ...
$ Pts : num 31.6 29.1 28.9 28 27.3 27 27 26.4 25.5 25.3 ...
$ FGM : num 10.2 8.3 9 10.3 9.7 8.8 9 9.9 8.6 8.5 ...
$ FGA : num 24 18.9 19.4 20.3 20.9 19.8 19.9 18.2 17.7 18.3 ...
$ FG_per : num 42.5 44 46.3 50.5 46.7 44.4 45.2 54.8 48.5 46.8 ...
$ three_PM : num 2.5 3.2 3.2 0.5 0.4 2.9 1.8 1.7 2 4.1 ...
$ three_PA : num 7.2 9.3 8.5 1.8 1.7 7.7 5 4.6 5.2 10 ...
$ three_Per : num 34.3 34.7 37.9 29.9 26.6 37 36.1 36.3 38 41.1 ...
$ FTM : num 8.8 9.2 7.8 6.9 7.4 6.5 7.2 4.8 6.3 4.1 ...
$ FTA : num 10.4 10.9 8.5 8.6 8.7 7.3 9.3 7.2 7.2 4.6 ...
$ FT_per : num 84.5 84.7 90.9 80.2 84.2 89.5 77.2 67.4 88 89.8 ...
$ OREB : num 1.7 1.2 0.6 2.3 0.9 0.6 2.1 1.3 1.1 0.8 ...
$ DREB : num 9 7 2.1 9.5 4.3 4.3 8.9 7.3 4.7 3.7 ...
$ REB : num 10.7 8.1 2.7 11.8 5.2 4.9 11 8.6 5.8 4.5 ...
$ AST : num 10.4 11.2 5.9 2.1 3.9 5.9 4.6 8.7 3.5 6.6 ...
$ TOV : num 5.4 5.7 2.8 2.4 2.4 2.6 3.7 4.1 2.1 3 ...
$ STL : num 1.6 1.5 0.9 1.3 1.1 0.9 1.4 1.2 1.8 1.8 ...
$ BLK : num 0.4 0.5 0.2 2.2 0.2 0.3 1.3 0.6 0.7 0.2 ...
$ PF : num 2.3 2.7 2.2 2.2 1.8 2 3.9 1.8 1.6 2.3 ...
$ FP : num 60.6 55.8 41.5 53.3 40.7 42.6 51.4 51.3 43.3 43.6 ...
$ DD2 : int 62 64 5 49 5 11 46 42 9 9 ...
$ TD3 : int 42 22 0 0 0 0 2 13 0 0 ...
$ Plus_minus : num 3.1 5.2 3.6 0.7 2 1.1 -0.3 6.5 5.9 12.8 ...
$ Pts_Off_TO : num 4.7 4.4 3.8 3.5 3.5 3.5 3.6 4.1 4.3 4.4 ...
$ second_pts : num 3.6 2.2 1.9 3.8 1.9 1.6 3.1 2.5 2.2 2.2 ...
$ FBPS : num 6.7 4 4.5 2.2 2.5 2.5 1.3 5.5 3.5 6 ...
$ PITP : num 10.1 8.6 9.6 12.9 10.3 8.6 11.8 13.9 7.2 6.4 ...
$ OPP_PTs_off_TO : num 12.2 13.3 10.7 12 11.1 11.3 11.5 12.4 9.5 11.4 ...
$ OPP_sec_pts : num 9 10.1 10.6 8.8 10.1 9.6 8 10.1 8.6 9.5 ...
$ OPP_FBPS : num 8.1 11.1 8.9 9.6 8.9 9.9 10.8 10 8.8 10.1 ...
$ OPP_PITP : num 34.1 37.6 31.5 31.9 31.7 32.6 27.9 35.2 28.4 31.1 ...
$ BLKA : num 1.1 1.1 1.5 0.7 1.2 1.3 1.5 0.8 0.4 0.7 ...
$ PFD : num 6.7 6.2 6.3 6.9 5.5 5.6 8.2 5.9 5.4 3.5 ...
$ Def.Rtg : num 105 107 109 102 107 ...
$ DREB_per : num 27.9 21.2 7 26.9 13.4 13.4 30.5 20.9 16 11.3 ...
$ Per_DREB : num 35.2 27.6 9.5 34.9 17.8 17.6 39.6 27.3 20.6 15 ...
$ STL_per : num 27 24.3 18.1 22.2 18.9 17.4 24.5 23.1 32.6 24.1 ...
$ Per_BLK : num 10.3 15.4 5.8 49.9 5.2 7.2 41.9 19.3 18.2 4.5 ...
$ OPP_PTS_OFF_TOV : num 12.2 13.3 10.7 12 11.1 11.3 11.5 12.4 9.5 11.4 ...
$ OPP_PTS_2nd_Chance: num 9 10.1 10.6 8.8 10.1 9.6 8 10.1 8.6 9.5 ...
$ OPP_PTS_FB : num 8.1 11.1 8.9 9.6 8.9 9.9 10.8 10 8.8 10.1 ...
$ OPP_PTS_Paint : num 34.1 37.6 31.5 31.9 31.7 32.6 27.9 35.2 28.4 31.1 ...
$ DEF_WS : num 0.043 0.033 0.025 0.054 0.034 0.026 0.023 0.035 0.043 0.058 ...
$ Dist_FT : num 12533 11875 12281 12574 12655 ...
$ Dist_Miles : num 2.37 2.25 2.33 2.38 2.4 2.49 2.05 2.39 2.46 2.48 ...
$ Dist_Miles_off : num 1.28 1.18 1.28 1.24 1.34 1.41 1.07 1.26 1.38 1.32 ...
$ Dist_Miles_def : num 1.09 1.07 1.05 1.15 1.06 1.08 0.97 1.13 1.08 1.16 ...
$ avg_speed : num 4.13 3.72 4.14 3.96 4.06 4.17 3.6 3.8 4.43 4.46 ...
$ avg_speed_off : num 4.72 4.07 4.6 4.33 4.42 4.66 3.67 3.99 4.86 5.13 ...
$ avg_speed_def : num 3.61 3.4 3.68 3.64 3.69 3.67 3.52 3.62 3.97 3.88 ...
$ OffRTG : num 108 114 114 104 110 ...
$ DefRTG : num 105 107 109 102 107 ...
$ NetRTG : num 3.3 6.3 5 1.7 3.3 1.2 -2.2 7.7 8.6 17.2 ...
$ AST_per : num 54.3 50.5 30.5 11 20.4 27.7 26.2 38.8 18.2 28.7 ...
$ AST_TO : num 1.92 1.95 2.13 0.87 1.61 2.23 1.23 2.13 1.69 2.19 ...
$ AST_RATIO : num 23.4 27.6 18.5 7.3 12.6 18.6 14.3 25.6 13.3 22.2 ...
$ OREBper : num 5.3 3.5 1.9 6.7 3 1.8 7.2 4 3.8 2.7 ...
$ DREBper : num 27.9 21.2 7 26.9 13.4 13.4 30.5 20.9 16 11.3 ...
$ REBper : num 16.7 12.3 4.5 17 8.3 7.5 18.8 12.7 10 7.3 ...
$ TO_Ratio : num 12.2 14.1 8.7 8.4 7.8 8.3 11.6 12 7.9 10.1 ...
$ EFGper : num 47.6 52.5 54.6 51.8 47.7 51.6 49.8 59.4 54.1 58 ...
$ TS_per : num 55.4 61.3 62.5 58 55.2 58.6 56.2 61.9 61 62.4 ...
$ USG_per : num 40.8 34.1 33.7 32.6 34.2 30.9 36.4 29.7 31.2 29.2 ...
$ PACE : num 102.3 103 99.8 100.2 97.7 ...
$ PIE : num 23 19 16.1 19.2 15.5 15.9 17.8 18.3 17.4 15.1 ...
$ salary2017_18 : num 28530608 28299399 6261395 23775506 27739975 ...
[73] "salary2017_18"
> names(nba2017)
[1] "Year" "Player" "Team.x" "Age"
[73] "salary2017_18"
> # Partition the data using stratified sampling to ensure that train and test have similar ranges of salary
> if (!requireNamespace("caret", quietly .... [TRUNCATED]
> library(caret)
> set.seed(123)
> ind <- createDataPartition(nba2017_mod$salary2017_18, p=0.70, list = F)
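The split itself is not shown in the transcript; a minimal sketch, assuming nba2017_mod is the modeling data frame (numeric predictors plus salary2017_18):

train <- nba2017_mod[ind, ]   # 70% of rows, stratified on salary2017_18
test  <- nba2017_mod[-ind, ]  # remaining 30%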
> # create a vector of player and team names for the test data
> test_players <- nba2017[-ind, "Player"]
> # bind the player and team names to the test data
> test <- cbind(test_players, test)
> print(colnames(test))
[1] "Player" "Age" "GP.x" "W.x"
> head(test)
Player Age GP.x W.x L.x Min.x Pts FGM FGA FG_per three_PM three_PA
1 Russell Westbrook 28 81 46 35 34.6 31.6 10.2 24.0 42.5 2.5 7.2
8 LeBron James 32 74 51 23 37.8 26.4 9.9 18.2 54.8 1.7 4.6
11 Kyrie Irving 25 72 47 25 35.1 25.2 9.3 19.7 47.3 2.5 6.1
12 Karl-Anthony Towns 21 82 31 51 37.0 25.1 9.8 18.0 54.2 1.2 3.4
13 Kevin Durant 28 62 51 11 33.4 25.1 8.9 16.5 53.7 1.9 5.0
14 Jimmy Butler 27 76 40 36 37.0 23.9 7.5 16.5 45.5 1.2 3.3
three_Per FTM FTA FT_per OREB DREB REB AST TOV STL BLK PF FP DD2 TD3 Plus_minus
1 34.3 8.8 10.4 84.5 1.7 9.0 10.7 10.4 5.4 1.6 0.4 2.3 60.6 62 42 3.1
8 36.3 4.8 7.2 67.4 1.3 7.3 8.6 8.7 4.1 1.2 0.6 1.8 51.3 42 13 6.5
11 40.1 4.1 4.6 90.5 0.7 2.5 3.2 5.8 2.5 1.2 0.3 2.2 39.7 8 0 4.6
12 36.7 4.3 5.2 83.2 3.6 8.7 12.3 2.7 2.6 0.7 1.3 2.9 47.1 62 1 -0.3
13 37.5 5.4 6.2 87.5 0.6 7.6 8.3 4.8 2.2 1.1 1.6 1.9 48.0 23 1 11.5
14 36.7 7.7 8.9 86.5 1.7 4.5 6.2 5.5 2.1 1.9 0.4 1.5 44.4 15 2 2.7
Pts_Off_TO second_pts FBPS PITP OPP_PTs_off_TO OPP_sec_pts OPP_FBPS OPP_PITP BLKA PFD
1 4.7 3.6 6.7 10.1 12.2 9.0 8.1 34.1 1.1 6.7
8 4.1 2.5 5.5 13.9 12.4 10.1 10.0 35.2 0.8 5.9
11 2.7 1.8 2.5 8.3 11.1 10.1 10.4 33.6 1.0 3.4
12 3.4 4.7 1.5 14.1 13.1 10.3 11.0 35.7 1.1 4.7
13 4.6 2.4 5.7 10.0 10.7 9.6 9.3 31.3 0.4 4.6
14 4.0 2.6 4.0 7.8 11.2 9.2 10.6 34.2 0.8 6.5
Def.Rtg DREB_per Per_DREB STL_per Per_BLK OPP_PTS_OFF_TOV OPP_PTS_2nd_Chance
1 104.6 27.9 35.2 27.0 10.3 12.2 9.0
8 107.1 20.9 27.3 23.1 19.3 12.4 10.1
11 109.1 7.5 9.9 23.3 11.2 11.1 10.1
12 110.8 27.7 36.5 11.6 36.0 13.1 10.3
13 101.3 23.2 30.3 16.1 34.6 10.7 9.6
14 103.4 13.1 17.0 30.6 10.7 11.2 9.2
OPP_PTS_FB OPP_PTS_Paint DEF_WS Dist_FT Dist_Miles Dist_Miles_off Dist_Miles_def
1 8.1 34.1 0.043 12533.4 2.37 1.28 1.09
8 10.0 35.2 0.035 12611.3 2.39 1.26 1.13
11 10.4 33.6 0.024 12428.6 2.35 1.28 1.08
12 11.0 35.7 0.018 13353.1 2.53 1.35 1.18
13 9.3 31.3 0.056 11874.8 2.25 1.17 1.08
14 10.6 34.2 0.051 13728.7 2.60 1.41 1.19
avg_speed avg_speed_off avg_speed_def OffRTG DefRTG NetRTG AST_per AST_TO AST_RATIO
1 4.13 4.72 3.61 107.9 104.6 3.3 54.3 1.92 23.4
8 3.80 3.99 3.62 114.9 107.1 7.7 38.8 2.13 25.6
11 4.03 4.34 3.72 114.2 109.1 5.1 27.8 2.32 19.3
12 4.11 4.28 3.94 109.9 110.8 -0.9 12.7 1.04 10.5
13 4.05 4.55 3.62 117.2 101.3 16.0 21.8 2.17 18.4
14 4.23 4.47 3.96 106.4 103.4 3.0 24.3 2.62 19.6
OREBper DREBper REBper TO_Ratio EFGper TS_per USG_per PACE PIE salary2017_18 Team
1 5.3 27.9 16.7 12.2 47.6 55.4 40.8 102.31 23.0 28530608 OKC
8 4.0 20.9 12.7 12.0 59.4 61.9 29.7 98.38 18.3 33285709 CLE
11 2.3 7.5 5.0 8.3 53.5 58.0 30.2 99.12 13.5 18868625 CLE
12 11.4 27.7 19.5 10.1 57.6 61.8 27.4 97.10 17.1 6216840 MIN
13 2.3 23.2 13.7 8.5 59.4 65.1 27.6 103.71 18.6 25000000 GSW
14 5.0 13.1 9.0 7.5 49.2 58.6 26.5 97.78 16.4 19301070 CHI
> # Get the abbreviation for each available model type (e.g., "lm" = linear model)
> names(getModelInfo())
[1] "ada" "AdaBag" "AdaBoost.M1"
[4] "adaboost" "amdai" "ANFIS"
[7] "avNNet" "awnb" "awtan"
[10] "bag" "bagEarth" "bagEarthGCV"
[13] "bagFDA" "bagFDAGCV" "bam"
[16] "bartMachine" "bayesglm" "binda"
[19] "blackboost" "blasso" "blassoAveraged"
[22] "bridge" "brnn" "BstLm"
[25] "bstSm" "bstTree" "C5.0"
[28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
[31] "cforest" "chaid" "CSimca"
[34] "ctree" "ctree2" "cubist"
[37] "dda" "deepboost" "DENFIS"
[40] "dnn" "dwdLinear" "dwdPoly"
[43] "dwdRadial" "earth" "elm"
[46] "enet" "evtree" "extraTrees"
[49] "fda" "FH.GBML" "FIR.DM"
[52] "foba" "FRBCS.CHI" "FRBCS.W"
[55] "FS.HGD" "gam" "gamboost"
[58] "gamLoess" "gamSpline" "gaussprLinear"
[61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
[64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
[67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
[70] "glm" "glmboost" "glmnet_h2o"
[73] "glmnet" "glmStepAIC" "gpls"
[76] "hda" "hdda" "hdrda"
[79] "HYFIS" "icr" "J48"
[82] "JRip" "kernelpls" "kknn"
[85] "knn" "krlsPoly" "krlsRadial"
[88] "lars" "lars2" "lasso"
[91] "lda" "lda2" "leapBackward"
[94] "leapForward" "leapSeq" "Linda"
[97] "lm" "lmStepAIC" "LMT"
[100] "loclda" "logicBag" "LogitBoost"
[103] "logreg" "lssvmLinear" "lssvmPoly"
[106] "lssvmRadial" "lvq" "M5"
[109] "M5Rules" "manb" "mda"
[112] "Mlda" "mlp" "mlpKerasDecay"
[115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
[118] "mlpML" "mlpSGD" "mlpWeightDecay"
[121] "mlpWeightDecayML" "monmlp" "msaenet"
[124] "multinom" "mxnet" "mxnetAdam"
[127] "naive_bayes" "nb" "nbDiscrete"
[130] "nbSearch" "neuralnet" "nnet"
[133] "nnls" "nodeHarvest" "null"
[136] "OneR" "ordinalNet" "ordinalRF"
[139] "ORFlog" "ORFpls" "ORFridge"
[142] "ORFsvm" "ownn" "pam"
[145] "parRF" "PART" "partDSA"
[148] "pcaNNet" "pcr" "pda"
[151] "pda2" "penalized" "PenalizedLDA"
[154] "plr" "pls" "plsRglm"
[157] "polr" "ppr" "pre"
[160] "PRIM" "protoclass" "qda"
[163] "QdaCov" "qrf" "qrnn"
[166] "randomGLM" "ranger" "rbf"
[169] "rbfDDA" "Rborist" "rda"
[172] "regLogistic" "relaxo" "rf"
[175] "rFerns" "RFlda" "rfRules"
[178] "ridge" "rlda" "rlm"
[181] "rmda" "rocc" "rotationForest"
[184] "rotationForestCp" "rpart" "rpart1SE"
[187] "rpart2" "rpartCost" "rpartScore"
[190] "rqlasso" "rqnc" "RRF"
[193] "RRFglobal" "rrlda" "RSimca"
[196] "rvmLinear" "rvmPoly" "rvmRadial"
[199] "SBC" "sda" "sdwd"
[202] "simpls" "SLAVE" "slda"
[205] "smda" "snn" "sparseLDA"
[208] "spikeslab" "spls" "stepLDA"
[211] "stepQDA" "superpc" "svmBoundrangeString"
[214] "svmExpoString" "svmLinear" "svmLinear2"
[217] "svmLinear3" "svmLinearWeights" "svmLinearWeights2"
[220] "svmPoly" "svmRadial" "svmRadialCost"
[223] "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
[226] "tan" "tanSearch" "treebag"
[229] "vbmpRadial" "vglmAdjCat" "vglmContRatio"
[232] "vglmCumulative" "widekernelpls" "WM"
[235] "wsrf" "xgbDART" "xgbLinear"
[238] "xgbTree" "xyf"
> # Results
> lm
Linear Regression
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 248, 248, 246, 250, 248, 248, ...
Resampling results:
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-15232772 -2471478 -307534 2214296 20368317
> # Scale-Location
> sqrt_std_resid <- sqrt(abs(residuals / sd(residuals)))
> # 1st plot: if the linearity assumption is met you should see no pattern here and the line should be flat; you should also see no pattern if the .... [TRUNCATED]
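For reference, the diagnostics the comments describe can also be drawn straight from the fitted model; a minimal sketch, assuming lm is the caret object above:

par(mfrow = c(1, 3))
plot(lm$finalModel, which = 1:3)  # residuals vs fitted, normal Q-Q, scale-location
par(mfrow = c(1, 1))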
> # Surprisingly, BLKA > Opp 2nd chance pts > GP > Opp pts in the paint > W were the top 5 correlated variables.
> # Reset margins back to default
> .... [TRUNCATED]
> ##################################################################################
> # Ridge regression
> # Ridge shrinks the coefficients but keeps .... [TRUNCATED]
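The fit that produced ridge was truncated; a sketch of the likely call, assuming caret's "ridge" method (elasticnet backend, consistent with the modelInfo printed below) and a small lambda grid containing the winning value 0.1:

ridge <- train(salary2017_18 ~ ., data = train, method = "ridge",
               trControl = ctrl,
               tuneGrid = expand.grid(lambda = c(0, 0.01, 0.1)))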
> ridge
Ridge Regression
310 samples
69 predictor
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was lambda = 0.1.
> ridge$modelInfo
$label
[1] "Ridge Regression"
$library
[1] "elasticnet"
$type
[1] "Regression"
$parameters
parameter class label
1 lambda numeric Weight Decay
$grid
function (x, y, len = NULL, search = "grid")
{
if (search == "grid") {
out <- expand.grid(lambda = c(0, 10^seq(-1, -4, length = len -
1)))
}
else {
out <- data.frame(lambda = 10^runif(len, min = -5, 1))
}
out
}
$loop
NULL
$fit
function (x, y, wts, param, lev, last, classProbs, ...)
{
elasticnet::enet(as.matrix(x), y, lambda = param$lambda)
}
<bytecode: 0x000002157908d7a8>
$predict
function (modelFit, newdata, submodels = NULL)
{
elasticnet::predict.enet(modelFit, newdata, s = 1, mode = "fraction")$fit
}
<bytecode: 0x00000215796c2f60>
$predictors
function (x, s = NULL, ...)
{
if (is.null(s)) {
if (!is.null(x$tuneValue)) {
s <- x$tuneValue$.fraction
}
else stop("must supply a vaue of s")
out <- elasticnet::predict.enet(x, s = s, type = "coefficients",
mode = "fraction")$coefficients
}
else {
out <- elasticnet::predict.enet(x, s = s)$coefficients
}
names(out)[out != 0]
}
$tags
[1] "Linear Regression" "L2 Regularization"
$prob
NULL
$sort
function (x)
x[order(-x$lambda), ]
> # Variable Importance for Ridge Model
> plot(varImp(ridge, scale = F), main = "Variable Importance - Ridge Model")
> # FP (fantasy pts) > Min > Pts > FGM > Dist_Miles were the top 5 correlated variables
>
> # There is another way to run ridge using glmnet and altho .... [TRUNCATED]
> set.seed(12345)
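The fitting call was elided from the transcript; a sketch, assuming ridge is obtained in glmnet by fixing alpha = 0 and tuning only lambda:

glmnet_ridge <- train(salary2017_18 ~ ., data = train, method = "glmnet",
                      trControl = ctrl,
                      tuneGrid = expand.grid(alpha = 0,
                                             lambda = seq(0.01, 1, length = 10)))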
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 247, 249, 248, 248, 248, 248, ...
Resampling results across tuning parameters:
> plot(glmnet_ridge)
> # Create a plot with three lines for RMSE, Rsquared, and MAE
> plot(lambda, RMSE, type = "l", col = "blue", ylim = c(min(RMSE, Rsquared, MAE), max(R .... [TRUNCATED]
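One way to finish the truncated plotting code, assuming the tuning metrics are pulled from glmnet_ridge$results (note Rsquared is on a 0-1 scale, so it hugs the bottom axis when plotted against salary-scale errors):

res_grid <- glmnet_ridge$results
plot(res_grid$lambda, res_grid$RMSE, type = "l", col = "blue",
     ylim = range(res_grid$RMSE, res_grid$Rsquared, res_grid$MAE),
     xlab = "lambda", ylab = "value")
lines(res_grid$lambda, res_grid$Rsquared, col = "darkgreen")
lines(res_grid$lambda, res_grid$MAE, col = "red")
legend("topright", c("RMSE", "Rsquared", "MAE"),
       col = c("blue", "darkgreen", "red"), lty = 1)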
> # DEF_win shares > BLKA > Dist_miles_def > avg_speed_off > 3pt made were the top 5 correlated variables
>
> # While both ridge and glmnet produce s .... [TRUNCATED]
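The fit that produced lasso was elided; a sketch, assuming caret's "lasso" method, which tunes the L1 fraction reported in the output below:

lasso <- train(salary2017_18 ~ ., data = train, method = "lasso",
               trControl = ctrl,
               tuneGrid = expand.grid(fraction = seq(0.1, 0.9, by = 0.1)))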
> lasso
The lasso
310 samples
69 predictor
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was fraction = 0.1.
> # Similarly to Ridge, FP (fantasy pts) > Min > Pts > FGM > Dist_Miles were the top 5 correlated variables
>
>
> #################################### .... [TRUNCATED]
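The elastic net fit was elided; a sketch, assuming method = "glmnet" with both alpha and lambda tuned (the output below reports alpha = 0 and lambda = 0.2 as the winners):

en <- train(salary2017_18 ~ ., data = train, method = "glmnet",
            trControl = ctrl,
            tuneGrid = expand.grid(alpha = seq(0, 1, by = 0.25),
                                   lambda = seq(0.1, 0.3, by = 0.05)))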
> en
glmnet
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 248, 248, 248, 249, 247, 248, ...
Resampling results across tuning parameters:
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 0 and lambda = 0.2.
> plot(en)
> plot(varImp(en, scale = F), main = "Variable Importance - Elastic Net Model")
> # Just like glmnet ridge, DEF_win shares > BLKA > Dist_miles_def > avg_speed_off > 3pt made were the top 5 correlated variables
>
> ################ .... [TRUNCATED]
> set.seed(54321)
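The MARS fit was elided; a sketch, assuming caret's "earth" method with its two tuning parameters, degree and nprune (the final model below settles on nprune = 7):

MARS1 <- train(salary2017_18 ~ ., data = train, method = "earth",
               trControl = ctrl,
               tuneGrid = expand.grid(degree = 1:2, nprune = 2:15))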
> MARS1
Multivariate Adaptive Regression Spline
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 248, 248, 249, 248, 247, 247, ...
Resampling results:
RMSE Rsquared MAE
5716119 0.5098306 4095691
> MARS1$finalModel
Selected 7 of 37 terms, and 7 of 69 predictors (nprune=7)
Termination condition: RSq changed by less than 0.001 at 37 terms
Importance: FP, DEF_WS, W.x-unused, Age, three_PA, OREB, Dist_Miles_off, FGM, ...
Number of terms at each degree of interaction: 1 4 2
GCV 2.199971e+13 RSS 6.134086e+15 GRSq 0.6318521 RSq 0.6667271
> # Min, Age, FP, BLKA, TD3 (triple doubles) were the only variables used.
>
> # Multivariate Adaptive Regression Splines
> # tuning parameter degr .... [TRUNCATED]
> MARS
Multivariate Adaptive Regression Splines
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 248, 248, 249, 248, 247, 249, ...
Resampling results:
> MARS$finalModel
Selected 13 of 21 terms, and 10 of 69 predictors
Termination condition: RSq changed by less than 0.001 at 21 terms
Importance: FP, DEF_WS, Age, OREB, Dist_FT, three_PA, OPP_FBPS, W.x, Pts, BLK, ...
Number of terms at each degree of interaction: 1 12 (additive model)
GCV 2.038615e+13 RSS 5.341501e+15 GRSq 0.6588536 RSq 0.7097892
> # FP, Dist_Miles_def, Age, TD3, BLKA, STL, OPP_FBPS, BLK, DEF_WS were the only variables used
> # MARS looks like it does the best at incorporating .... [TRUNCATED]
Call:
summary.resamples(object = res)
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
LinearModel 3317404 3948132 4185130 4212961 4628162 5098863 0
Ridge 3081720 3640888 3912416 3858794 4118357 4528260 0
GlmnetRidge 3363802 3545343 3760396 3853595 4113259 4834166 0
Lasso 3147714 3710169 4003428 3904404 4104815 4580604 0
ElasticNet 2718386 3609310 3860008 3812247 3975460 4793636 0
MARS 3324116 3831165 4097809 4095691 4294680 5080470 0
MARSs 3176418 3605303 3879350 3872835 4047672 4683669 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
LinearModel 4546094 5062952 5636885 5573061 5957401 6560565 0
Ridge 3993764 4819406 5076350 5048515 5330219 6133344 0
GlmnetRidge 4138136 4629204 4955967 5014903 5409471 6035031 0
Lasso 4101223 4768681 5257961 5160723 5522938 6290290 0
ElasticNet 3540897 4662299 5055350 4984094 5265990 6123995 0
MARS 4514745 5286385 5638189 5716119 6006468 7049854 0
MARSs 4114448 4993291 5250407 5294033 5790850 6381070 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
LinearModel 0.3504425 0.4776367 0.5263815 0.5166265 0.5593456 0.6182352 0
Ridge 0.4656987 0.5299787 0.5822513 0.5943249 0.6640283 0.7206675 0
GlmnetRidge 0.4459301 0.5475673 0.5782917 0.5838632 0.6411426 0.7502092 0
Lasso 0.3838528 0.4906664 0.5793424 0.5659929 0.6541656 0.7362773 0
ElasticNet 0.4360748 0.5217468 0.6004407 0.5852860 0.6439920 0.7503014 0
MARS 0.2517839 0.4342799 0.5320606 0.5098306 0.5765996 0.6478697 0
MARSs 0.3004270 0.4965708 0.5278863 0.5488899 0.6162878 0.7242066 0
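The construction of res was elided; the table above is what caret's resamples() produces for a named list of fitted models, so the call was presumably along these lines (assuming MARS in the table maps to MARS1 above and MARSs to the second, tuned MARS fit):

res <- resamples(list(LinearModel = lm, Ridge = ridge, GlmnetRidge = glmnet_ridge,
                      Lasso = lasso, ElasticNet = en, MARS = MARS1, MARSs = MARS))
summary(res)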
> # Based on mean Rsquared it looks like the best model was en (Elastic Net Regression)
> # However, let's use all the models to predict the test datase .... [TRUNCATED]
[1] "Year" "Player" "Team" "Age"
> names(nba2017)
[1] "Year" "Player" "Team.x" "Age"
[73] "salary2017_18"
> nba2018 <- nba2018[-223,]
> str(nba2018)
'data.frame': 539 obs. of 72 variables:
$ Year : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
$ Player : chr "James Harden" "Anthony Davis" "LeBron James" "Damian Lillard" ...
$ Team : chr "HOU" "NOP" "CLE" "POR" ...
$ Age : int 28 25 33 27 23 30 29 29 27 21 ...
$ GP.x : int 72 75 82 73 75 51 68 80 48 54 ...
$ W.x : int 59 45 50 44 39 41 49 47 27 16 ...
$ L.x : int 13 30 32 29 36 10 19 33 21 38 ...
$ Min.x : num 35.4 36.4 36.9 36.6 36.8 32 34.2 36.4 36.2 34.5 ...
$ Pts : num 30.4 28.1 27.5 26.9 26.9 26.4 26.4 25.4 25.2 24.9 ...
$ FGM : num 9 10.4 10.5 8.5 9.9 8.4 9.3 9.5 8.5 8.4 ...
$ FGA : num 20.1 19.5 19.3 19.4 18.7 16.9 18 21.1 18 19.5 ...
$ FG_per : num 44.9 53.4 54.2 43.9 52.9 49.5 51.6 44.9 47 43.2 ...
$ three_PM : num 3.7 0.7 1.8 3.1 0.6 4.2 2.5 1.2 2.2 2.7 ...
$ three_PA : num 10 2.2 5 8.6 1.9 9.8 6.1 4.1 6.1 7.1 ...
$ three_Per : num 36.7 34 36.7 36.1 30.7 42.3 41.9 29.8 35.4 38.3 ...
$ FTM : num 8.7 6.6 4.7 6.8 6.5 5.5 5.3 5.2 6.1 5.4 ...
$ FTA : num 10.1 8 6.5 7.4 8.5 5.9 5.9 7.1 8.2 6.1 ...
$ FT_per : num 85.8 82.8 73.1 91.6 76 92.1 88.9 73.7 74.6 87.8 ...
$ OREB : num 0.6 2.5 1.2 0.8 2.1 0.7 0.5 1.9 2.2 0.5 ...
$ DREB : num 4.8 8.6 7.5 3.6 8 4.4 6.4 8.2 10.7 4 ...
$ REB : num 5.4 11.1 8.6 4.5 10 5.1 6.8 10.1 12.9 4.5 ...
$ AST : num 8.8 2.3 9.1 6.6 4.8 6.1 5.4 10.3 5.4 4.7 ...
$ TOV : num 4.4 2.2 4.2 2.8 3 3 3 4.8 5 3.6 ...
$ STL : num 1.8 1.5 1.4 1.1 1.5 1.6 0.7 1.8 1.6 0.9 ...
$ BLK : num 0.7 2.6 0.9 0.4 1.4 0.2 1.8 0.3 1.6 0.3 ...
$ PF : num 2.3 2.1 1.7 1.6 3.1 2.2 2 2.5 3.8 3.1 ...
$ FP : num 53 55.1 54.1 43.6 51.7 43.8 47 54.3 53.3 37.2 ...
$ DD2 : int 31 50 52 11 42 5 15 59 38 4 ...
$ TD3 : int 4 1 18 0 1 0 2 25 3 0 ...
$ Plus_minus : num 7.3 3.9 1.3 4.7 2 9.5 5.1 4.8 1.7 -6.6 ...
$ Pts_Off_TO : num 4.7 4.2 4.1 3.4 4.1 4.3 3.7 5.2 3.5 2.6 ...
$ second_pts : num 2 4.5 2.2 2 3 1.6 1.9 3 3.3 1.8 ...
$ FBPS : num 2 3.6 4.2 2.2 4.7 4.3 4.3 5.5 1.9 3.4 ...
$ PITP : num 9.3 15.4 14.4 7.7 15.7 5.4 7.7 11.3 11.3 6.7 ...
$ OPP_PTs_off_TO : num 11.9 12.2 12.7 11.3 11.3 12.3 13.5 11.2 13.1 14.2 ...
$ OPP_sec_pts : num 8.8 10.6 9.7 8.9 9.5 9 10.1 9.3 9 9.2 ...
$ OPP_FBPS : num 9.2 10.9 9.1 8.7 7 9.9 10.8 9 10.3 11.6 ...
$ OPP_PITP : num 35.7 34.8 36.2 32.7 34.4 32.5 34.5 33.5 33.2 33.9 ...
$ BLKA : num 1.4 0.6 0.8 1.2 1.1 0.4 0.6 1.1 1.3 1.1 ...
$ PFD : num 7 7.4 5.4 5.4 6.8 4.1 4.8 5.8 7.8 5.4 ...
$ Def.Rtg : num 105 103 111 104 105 ...
$ DREB_per : num 15.3 24.1 22.5 10.6 24.9 14.1 19.2 25.4 32.5 12.6 ...
$ Per_DREB : num 19.3 31.6 29.1 13.3 32.4 18.5 25.6 32.3 41.3 16.4 ...
$ STL_per : num 26.8 24.3 26 19.2 21.7 27.9 13.3 25.8 28.3 17.3 ...
$ Per_BLK : num 19.4 53.5 28.9 8.9 32 3.3 33.7 6.9 40.2 7.7 ...
$ OPP_PTS_OFF_TOV : num 11.9 12.2 12.7 11.3 11.3 12.3 13.5 11.2 13.1 14.2 ...
$ OPP_PTS_2nd_Chance: num 8.8 10.6 9.7 8.9 9.5 9 10.1 9.3 9 9.2 ...
$ OPP_PTS_FB : num 9.2 10.9 9.1 8.7 7 9.9 10.8 9 10.3 11.6 ...
$ OPP_PTS_Paint : num 35.7 34.8 36.2 32.7 34.4 32.5 34.5 33.5 33.2 33.9 ...
$ DEF_WS : num 0.043 0.05 0.017 0.05 0.042 0.037 0.033 0.046 0.036 0.013 ...
$ Dist_FT : num 11475 12628 12545 13318 13292 ...
$ Dist_Miles : num 2.17 2.39 2.38 2.52 2.52 2.42 2.3 2.51 2.17 2.46 ...
$ Dist_Miles_off : num 1.17 1.24 1.25 1.41 1.33 1.28 1.21 1.34 1.11 1.36 ...
$ Dist_Miles_def : num 1 1.15 1.12 1.11 1.19 1.13 1.09 1.16 1.05 1.11 ...
$ avg_speed : num 3.69 3.96 3.88 4.16 4.15 4.55 4.06 4.14 3.6 4.3 ...
$ avg_speed_off : num 3.97 4.31 4.1 4.53 4.61 5.24 4.55 4.67 3.77 4.73 ...
$ avg_speed_def : num 3.41 3.64 3.66 3.77 3.74 3.96 3.63 3.66 3.44 3.86 ...
$ OffRTG : num 114 109 113 108 109 ...
$ DefRTG : num 105 103 111 103 105 ...
$ NetRTG : num 9.9 5.4 1.7 4.8 3.5 14.7 8.8 5.7 1.8 -10 ...
$ AST_per : num 44.9 10.4 43.2 30.1 23.7 27.2 24.3 46.4 24.1 23.3 ...
$ AST_TO : num 2 1.07 2.15 2.33 1.62 2.03 1.77 2.15 1.06 1.3 ...
$ AST_RATIO : num 23.2 8.4 25.7 20.6 15.9 21.2 18.6 26.1 16.7 15.4 ...
$ OREBper : num 1.8 7.7 3.7 2.4 6.7 2.7 1.6 5.4 7.1 1.3 ...
$ DREBper : num 15.4 24.1 22.5 10.7 24.9 14.1 19.2 25.5 32.5 12.7 ...
$ REBper : num 8.8 16.3 13.3 6.6 15.9 8.9 11.1 15.1 20.2 7 ...
$ TO_Ratio : num 11.6 7.9 11.9 8.8 9.8 10.5 10.5 12.1 15.8 11.8 ...
$ EFGper : num 54.1 55.2 59 51.9 54.5 61.8 58.6 47.7 53 50.1 ...
$ TS_per : num 61.9 61.2 62.1 59.4 59.8 67.5 64 52.4 58.3 56.1 ...
$ USG_per : num 36.1 29.7 31.6 30.1 31.2 29.7 29.9 33.2 32.4 30.9 ...
$ PACE : num 99.8 103.3 100.1 100.4 98.4 ...
$ PIE : num 19.4 18.8 19.1 16.5 18.6 16.5 16.8 18.6 16.7 12.3 ...
> # Merge actual salary with the nba2018 dataset and compute the difference between actual and predicted
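The merge itself is not shown; a minimal sketch of the idea, where salary18 and its salary column are hypothetical names for the actual 2018-19 salary table:

nba2018_salary <- merge(nba2018, salary18, by = "Player")
nba2018_salary$PV_EN   <- predict(en, newdata = nba2018_salary)        # predicted salary, elastic net
nba2018_salary$Diff_EN <- nba2018_salary$salary - nba2018_salary$PV_EN # positive = overpaid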
> names(nba2018)
[1] "Year" "Player" "Team" "Age"
> names(nba2018_salaryFinal)
[1] "Year" "Player" "Team.x" "Age"
> # Write the final data set to file so that you have a copy
> write.csv(nba2018_salary, "nba2018_salary.csv")
> # If you want to save the final model as an object that can be recalled later, do the following:
> saveRDS(en, "final_model.rds")
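fm is printed below without its assignment being shown; presumably it was reloaded from disk with readRDS:

fm <- readRDS("final_model.rds")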
> print(fm)
glmnet
310 samples
69 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 248, 248, 248, 249, 247, 248, ...
Resampling results across tuning parameters:
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 0 and lambda = 0.2.
> sqrt(mean((train$salary2017_18-predict1)^2))
[1] 4426223
> sqrt(mean((test$salary2017_18-predict2)^2))
[1] 4882551
> # Determine the team with the biggest difference between predicted and actual salary. Basically, which team is over/underpaying the most?
> # First .... [TRUNCATED]
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[15] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[29] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[43] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[57] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[99] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[113] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[127] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[141] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[155] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[183] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
[197] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[211] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[225] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[239] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[267] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[281] FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[295] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[323] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[337] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
[351] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[365] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[379] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[393] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
[407] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
[421] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[435] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
[449] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[463] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
[477] FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[491] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
[505] FALSE FALSE FALSE FALSE
> duplicated(nba2018_salary_minDup$Player)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[15] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[29] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[43] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[57] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[99] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[113] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[127] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[183] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[197] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[225] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[239] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[267] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[281] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[295] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[323] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[351] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[365] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[379] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[393] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[407] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[435] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[449] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[463] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> names(nba2018_salary_minDup)
[1] "Year" "Player" "Team.x" "PV_EN" "PV_lm"
[16] "Diff_MARS1"
> # Use the highest Rsquared model for prediction (Elastic Net)
> predictions <- predict(en, newdata = test)
> print(colnames(test))
[1] "Player" "Age" "GP.x" "W.x"
[73] "diff"
> # Find team performance (sum of diff values for each team)
> team_performance <- aggregate(diff ~ Team, data = test, sum)
> # Identify teams that overpaid (positive sum of diff values)
> overpaid_teams <- team_performance[team_performance$diff > 0, ]
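To answer the question posed above, the aggregated differences can simply be sorted; a minimal sketch:

team_performance <- team_performance[order(-team_performance$diff), ]
head(team_performance, 1)  # biggest overpayer
tail(team_performance, 1)  # biggest underpayer (most negative diff)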
> dev.off()
null device
1
> sink()