Diamonds: Analyze Diamonds by Their Cut, Color, Clarity, Price, and Other Attributes

Diamonds
Analyze diamonds by their cut, color, clarity, price, and other attributes
Group Details
18BM60011 Anjali Bhawnani
18BM60069 Neeraj Kumar Soni
18BM60063 Manpreet Singh
18BM60055 Akash Konapure

Dataset and attributes:
This dataset contains the prices and other attributes of almost 54,000 diamonds.
The attributes are as follows:
ATTRIBUTE EXPLANATION
PRICE price in US dollars (\$326--\$18,823)
CARAT weight of the diamond (0.2--5.01)
CUT quality of the cut (Fair, Good, Very Good,
Premium, Ideal)
COLOR diamond colour, from J (worst) to D (best)
CLARITY a measurement of how clear the diamond is (I1
(worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF
(best))
X length in mm (0--10.74)
Y width in mm (0--58.9)
Z depth in mm (0--31.8)
DEPTH total depth percentage = z / mean(x, y) = 2 * z /
(x + y) (43--79)
TABLE width of top of diamond relative to widest
point (43--95)
Method:
The method used is multiple linear regression, with price as the response variable.
Code:
library(ggplot2)
library(plyr)
library(dplyr) #Data manipulation
# Read file
# file.choose() opens an interactive file picker for the diamonds CSV.
diamonds <- read.csv(file.choose(), header = TRUE)

# Data cleaning and preparation
summary(diamonds)

# The CSV carries a leading row-number column that read.csv() names "X"
# (1..n in the summary above). It is not a diamond attribute: left in place it
# shifts every column position by one and would end up as a predictor in a
# `price ~ .` model, so drop it. This is a no-op when the column is absent;
# the lowercase "x" (length in mm) is a different column and is kept.
diamonds <- diamonds[, names(diamonds) != "X", drop = FALSE]

# Change the cut, color and clarity variables to factors
for (factor_col in c("cut", "color", "clarity")) {
  diamonds[[factor_col]] <- as.factor(diamonds[[factor_col]])
}
levels(diamonds$cut)

# Check the missing values in the dataset column-wise and show the counts
na <- colSums(is.na(diamonds))
print(na)
# No missing value found in the dataset
# Finding outliers
# Separating continuous and categorical variables.
# Columns are selected by name rather than by position so the split does not
# depend on column order: a leading row-index column in the CSV shifts
# positions 2:4 onto carat/cut/color instead of cut/color/clarity.
categorical_cols <- c("cut", "color", "clarity")
categorical_var <- diamonds[, categorical_cols, drop = FALSE]
continous_var <- diamonds[
  , setdiff(names(diamonds), categorical_cols),
  drop = FALSE
]

# 98th percentile of every numeric column, as a named numeric vector
numeric_cols <- names(continous_var)[
  vapply(continous_var, is.numeric, logical(1))
]
percentile_98th <- vapply(
  continous_var[numeric_cols],
  function(column) {
    quantile(column, probs = 0.98, na.rm = TRUE, names = FALSE)
  },
  numeric(1)
)

# Changing outlier values to the 98th percentile value: anything above the
# threshold is replaced by the threshold itself; NA entries stay NA.
for (col_name in numeric_cols) {
  continous_var[[col_name]] <- pmin(
    continous_var[[col_name]],
    percentile_98th[[col_name]]
  )
}

# Rebinding the outlier-treated continuous variables with the categorical ones
data_main <- cbind(continous_var, categorical_var)

# Plotting variables against price to visually see their impact

# Build a bar chart showing the mean price for each distinct value of a column.
#   data    : data frame holding `price` and the column named by `var`
#   var     : name (string) of the column shown on the x axis
#   x_label : x-axis label
#   title   : plot title
#   fill    : bar fill colour
# Returns the ggplot object. Nothing is drawn until the object is printed,
# e.g. print(a).
plot_mean_price <- function(data, var, x_label, title, fill) {
  ggplot(data = data) +
    geom_bar(
      aes(x = .data[[var]], y = price),
      # `fun` replaces the deprecated `fun.y` argument of the summary stat
      stat = "summary", fun = "mean", alpha = 1, fill = fill
    ) +
    xlab(x_label) +
    ylab("Average Price") +
    ggtitle(title)
}

# NOTE: the plot objects keep their original one-letter names; `c` and `i`
# therefore shadow the usual meaning of those names as variables.
# Plotting cut vs price
a <- plot_mean_price(data_main, "cut", "Cut Type", "Cut vs Price", "blue")
# Plotting color vs price
b <- plot_mean_price(data_main, "color", "Color", "Color vs Price", "red")
# Plotting clarity vs price
c <- plot_mean_price(
  data_main, "clarity", "Clarity Type", "Clarity vs Price", "green"
)
# Plotting carat vs price
d <- plot_mean_price(data_main, "carat", "Carat", "Carat vs Price", "orange")
# Plotting depth vs price
e <- plot_mean_price(data_main, "depth", "Depth", "Depth vs Price", "pink")
# Plotting table vs price
f <- plot_mean_price(data_main, "table", "Table", "Table vs Price", "purple")
# Plotting X vs price
g <- plot_mean_price(data_main, "x", "X", "X vs Price", "purple")
# Plotting Y vs price
h <- plot_mean_price(data_main, "y", "Y", "Y vs Price", "purple")
# Plotting Z vs price
i <- plot_mean_price(data_main, "z", "Z", "Z vs Price", "purple")
# Checking the values assigned to categories and change them to 1,2,3..
# The replacement labels are given as strings because revalue() takes a named
# character vector (old level = new label). The numbers follow the existing
# (alphabetical) level order; the columns remain factors after relabelling.

# Revalue the cut column
table(data_main$cut)
data_main$cut <- revalue(
  data_main$cut,
  # "Very Good" must be a single-line string to match the factor level
  c("Fair" = "1", "Good" = "2", "Ideal" = "3", "Premium" = "4",
    "Very Good" = "5")
)

# Revalue the colors column
table(data_main$color)
data_main$color <- revalue(
  data_main$color,
  c("D" = "1", "E" = "2", "F" = "3", "G" = "4", "H" = "5", "I" = "6",
    "J" = "7")
)

# Revalue the clarity variable
table(data_main$clarity)
data_main$clarity <- revalue(
  data_main$clarity,
  c("I1" = "1", "IF" = "2", "SI1" = "3", "SI2" = "4", "VS1" = "5",
    "VS2" = "6", "VVS1" = "7", "VVS2" = "8")
)
# Fit a linear model of price on every other column of data_main
model1 <- lm(formula = price ~ ., data = data_main)
summary(model1)
# Draw the standard plots for the fitted model
plot(model1)

# Pull the residuals out of the fitted model
residuals1 <- residuals(model1)
# Plot the residuals in observation order
plot(residuals1)
Steps and Output
1. Read the data and see how data looks.

2. Summary of the data is:
X carat cut color

Min. : 1 Min. :0.2000 Fair : 1610 D: 6775
1st Qu.:13486 1st Qu.:0.4000 Good : 4906 E: 9797
Median :26971 Median :0.7000 Ideal :21551 F: 9542
Mean :26971 Mean :0.7979 Premium :13791 G:11292
3rd Qu.:40455 3rd Qu.:1.0400 Very Good:12082 H: 8304
Max. :53940 Max. :5.0100 I: 5422
J: 2808
clarity depth table price
SI1 :13065 Min. :43.00 Min. :43.00 Min. : 326
VS2 :12258 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950
SI2 : 9194 Median :61.80 Median :57.00 Median : 2401
VS1 : 8171 Mean :61.75 Mean :57.46 Mean : 3933
VVS2 : 5066 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324
VVS1 : 3655 Max. :79.00 Max. :95.00 Max. :18823
(Other): 2531
X Y Z
Min. : 0.000 Min. : 0.00 Min. : 0.000
1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
Median : 5.700 Median : 5.710 Median : 3.530
Mean : 5.731 Mean : 5.735 Mean : 3.539
3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
Max. :10.740 Max. :58.900 Max. :31.800
3. Categorizing variables:
Qualitative Features (Categorical): Cut, Color, Clarity.

Quantitative Features (Numerical): Carat, Depth, Table, Price, X, Y, Z.
Target variable is price.
4. Create factors for categorical variables

5. Find the missing values
Price 0
Carat 0
Cut 0
Color 0
Clarity 0
X 0
Y 0
Z 0
Depth 0
Table 0
6. Next, we find outliers by computing the 98th percentile of each continuous variable.

7. We then replace every value above the 98th percentile with the 98th-percentile value. This step and the
previous one apply only to the continuous variables.
8. Next we plot each variable against price to see the impact.
9. Next we run the linear regression model.
Residuals:
Min 1Q Median 3Q Max
-21248.6 -618.6 -170.8 414.6 8640.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.094e+03 4.412e+02 -4.746 2.08e-06 ***
X 4.998e-03 3.343e-04 14.951 < 2e-16 ***
clarity2 5.245e+03 4.917e+01 106.662 < 2e-16 ***
clarity3 3.551e+03 4.210e+01 84.339 < 2e-16 ***
clarity4 2.624e+03 4.225e+01 62.105 < 2e-16 ***
clarity5 4.470e+03 4.296e+01 104.051 < 2e-16 ***
clarity6 4.163e+03 4.229e+01 98.425 < 2e-16 ***
clarity7 4.930e+03 4.544e+01 108.503 < 2e-16 ***
clarity8 4.860e+03 4.421e+01 109.949 < 2e-16 ***
depth -3.715e+01 5.431e+00 -6.840 8.01e-12 ***
table -2.191e+01 2.931e+00 -7.478 7.69e-14 ***
x -8.806e+02 8.136e+01 -10.824 < 2e-16 ***
y 3.795e+02 8.042e+01 4.718 2.38e-06 ***
z 1.190e+02 6.100e+01 1.951 0.0511 .
carat 9.753e+03 4.245e+01 229.756 < 2e-16 ***
cut2 6.178e+02 3.237e+01 19.088 < 2e-16 ***
cut3 8.710e+02 3.121e+01 27.907 < 2e-16 ***
cut4 8.236e+02 3.018e+01 27.294 < 2e-16 ***
cut5 7.692e+02 3.078e+01 24.991 < 2e-16 ***
color2 -2.097e+02 1.723e+01 -12.171 < 2e-16 ***
color3 -2.837e+02 1.743e+01 -16.280 < 2e-16 ***
color4 -4.820e+02 1.706e+01 -28.247 < 2e-16 ***
color5 -9.537e+02 1.814e+01 -52.565 < 2e-16 ***
color6 -1.427e+03 2.038e+01 -70.010 < 2e-16 ***
color7 -2.289e+03 2.517e+01 -90.963 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1088 on 53915 degrees of freedom

Multiple R-squared: 0.9224, Adjusted R-squared: 0.9224
F-statistic: 2.67e+04 on 24 and 53915 DF, p-value: < 2.2e-16
10. Finally, we plot the residuals.

Diamonds: Analyze Diamonds by Their Cut, Color, Clarity, Price, and Other Attributes

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Diamonds: Analyze Diamonds by Their Cut, Color, Clarity, Price, and Other Attributes

Uploaded by

Copyright:

Available Formats

Diamonds

Analyze diamonds by their cut, color,

18BM60011 Anjali Bhawnani

18BM60069 Neeraj Kumar Soni

18BM60063 Manpreet Singh

18BM60055 Akash Konapure

The attributes are as follows:

library(dplyr) #Data manipulation

#data cleaning and preparation

#Change the cut variable to a factor

diamonds$cut <- as.factor(diamonds$cut)

#Change the color variable to a factor

#Change the clarity variable to a factor

diamonds$clarity <- as.factor(diamonds$clarity)

#Check the missing values in the dataset column-wise

#No missing value found in the dataset

# Seperating continuous and categorical variables

categorical_var <- diamonds[,c(2,3,4)]

continous_var <- diamonds[,-c(2,3,4)]

percentile_98th <- diamonds[1,]

percentile_98th[,i] <- quantile(continous_var[,i], 0.98, na.rm = T)

#Changing outlier values to the 98th percentile value

continous_var[which(continous_var[,i]>percentile_98th[,i]),i] <- percentile_98th[,i]

#rebinding the outlier treated continous variables with categorical variables

data_main <- cbind(continous_var,categorical_var)

# Plotting cut vs price

a <- ggplot(data = data_main)+

# Plotting color vs price

b <- ggplot(data = data_main)+

# Plotting clarity vs price

c <- ggplot(data = data_main)+

# Plotting carat vs price

d <- ggplot(data = data_main)+

e <- ggplot(data = data_main)+

# Plotting table vs price

f <- ggplot(data = data_main)+

g <- ggplot(data = data_main)+

h <- ggplot(data = data_main)+

i <- ggplot(data = data_main)+

#Checking the values assigned to categories and change them to 1,2,3..

#Revalue the cut column

data_main$cut <- revalue(data_main$cut,c("Fair"=1,"Good"=2,"Ideal"=3,"Premium"=4,"Very

#Revalue the colors column

data_main$color <- revalue(data_main$color,c("D"=1,"E"=2,"F"=3,"G"=4,"H"=5,"I"=6,"J"=7))

#Revalue the clarity variable

#Applying linear regression

model1 <- lm(price~.,data=data_main)

#Extracting residuals from model1

residuals1 <- resid(model1)

#Plotting the residuals

1. Read the data and see how data looks.

X carat cut color

Qualitative Features (Categorical): Cut, Color, Clarity.

4. Create factors for categorical variables

6. Next we find outliers using 98th percentile function

Residual standard error: 1088 on 53915 degrees of freedom

10. And plot the residuals .

You might also like