You are on page 1of 14

Diamonds

Analyze diamonds by their cut, color,


clarity, price, and other attributes

Group Details

18BM60011 Anjali Bhawnani

18BM60069 Neeraj Kumar Soni

18BM60063 Manpreet Singh

18BM60055 Akash Konapure


Dataset and attributes:

This dataset contains the prices and other attributes of almost 54,000 diamonds.

The attributes are as follows:

ATTRIBUTE EXPLANATION
PRICE price in US dollars (\$326--\$18,823)
CARAT weight of the diamond (0.2--5.01)
CUT quality of the cut (Fair, Good, Very Good,
Premium, Ideal)
COLOR diamond colour, from J (worst) to D (best)
CLARITY a measurement of how clear the diamond is (I1
(worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF
(best))
X length in mm (0--10.74)
Y width in mm (0--58.9)
Z depth in mm (0--31.8)
DEPTH total depth percentage = z / mean(x, y) = 2 * z /
(x + y) (43--79)
TABLE width of top of diamond relative to widest
point (43--95)

Method:
The method used is linear regression.

Code:

library(ggplot2)

library(plyr)

library(dplyr) #Data manipulation

# Read file ----
# file.choose() opens an interactive file picker, so this script has to be run
# from an interactive R session; pick the diamonds CSV when prompted.
diamonds <- read.csv(file.choose(), header = TRUE)

# Data cleaning and preparation ----
summary(diamonds)

# Change the cut, color and clarity variables to factors.
# as.factor() sorts the levels alphabetically, not by quality.
diamonds[c("cut", "color", "clarity")] <-
  lapply(diamonds[c("cut", "color", "clarity")], as.factor)

levels(diamonds$cut)

# Check the missing values in the dataset column-wise.
# The counts are printed so they can be inspected; a bare assignment shows
# nothing on the console.
na <- colSums(is.na(diamonds))
print(na)
# No missing value found in the dataset

# Finding outliers ----

# Separating continuous and categorical variables.
# The split is done by column type instead of by position: with a leading
# row-index column in the CSV, positions 2:4 are carat, cut and color, so a
# positional split would leave carat uncapped and treat clarity as continuous.
is_continuous <- vapply(diamonds, is.numeric, logical(1))
categorical_var <- diamonds[, !is_continuous, drop = FALSE]
continous_var <- diamonds[, is_continuous, drop = FALSE]

# 98th percentile of each continuous column, kept as a one-row data frame
# whose columns line up one-to-one with continous_var.
percentile_98th <- as.data.frame(
  lapply(continous_var, quantile, probs = 0.98, na.rm = TRUE)
)

# Changing outlier values to the 98th percentile value.
# pmin() caps every value above the threshold; missing values stay NA.
# Note that this also caps the target variable (price), as it is numeric.
continous_var[] <- Map(pmin, continous_var, percentile_98th)

# Rebinding the outlier-treated continuous variables with the categorical ones
data_main <- cbind(continous_var, categorical_var)


# Plotting variables against price to visually see their impact ----

# Build a bar chart of the mean price for each distinct value of one variable.
#   data     data frame containing `price` and the column named by `x_var`
#   x_var    name of the column to put on the x axis (character string)
#   x_label  x-axis label
#   title    plot title
#   fill     bar fill colour
# Returns a ggplot object; nothing is drawn until the object is printed.
# `fun` replaces the deprecated `fun.y` argument (needs ggplot2 >= 3.3.0).
plot_mean_price <- function(data, x_var, x_label, title, fill) {
  ggplot(data = data) +
    geom_bar(
      aes(x = .data[[x_var]], y = price),
      stat = "summary", fun = "mean", alpha = 1, fill = fill
    ) +
    xlab(x_label) +
    ylab("Average Price") +
    ggtitle(title)
}

# The plots are only stored here; print one (e.g. `print(a)`) to render it.
# NOTE(review): the variable `c` shares its name with base::c(); calls such as
# c(1, 2) still resolve to the function, but a clearer name would be safer.

# Plotting cut vs price
a <- plot_mean_price(data_main, "cut", "Cut Type", "Cut vs Price", "blue")

# Plotting color vs price
b <- plot_mean_price(data_main, "color", "Color", "Color vs Price", "red")

# Plotting clarity vs price
c <- plot_mean_price(
  data_main, "clarity", "Clarity Type", "Clarity vs Price", "green"
)

# Plotting carat vs price
d <- plot_mean_price(data_main, "carat", "Carat", "Carat vs Price", "orange")

# Plotting depth vs price
e <- plot_mean_price(data_main, "depth", "Depth", "Depth vs Price", "pink")

# Plotting table vs price
f <- plot_mean_price(data_main, "table", "Table", "Table vs Price", "purple")

# Plotting X vs price
g <- plot_mean_price(data_main, "x", "X", "X vs Price", "purple")

# Plotting Y vs price
h <- plot_mean_price(data_main, "y", "Y", "Y vs Price", "purple")

# Plotting Z vs price
i <- plot_mean_price(data_main, "z", "Z", "Z vs Price", "purple")

# Checking the values assigned to categories and change them to 1,2,3..
# The codes below are plain labels: the columns stay factors, and the numbers
# do not rank quality (e.g. clarity "IF" is mapped to 2).
# Replacement values are given as strings because revalue() expects a named
# character vector.

# Revalue the cut column
table(data_main$cut)
data_main$cut <- revalue(
  data_main$cut,
  c("Fair" = "1", "Good" = "2", "Ideal" = "3", "Premium" = "4",
    "Very Good" = "5")
)

# Revalue the colors column
table(data_main$color)
data_main$color <- revalue(
  data_main$color,
  c("D" = "1", "E" = "2", "F" = "3", "G" = "4", "H" = "5", "I" = "6",
    "J" = "7")
)

# Revalue the clarity variable
table(data_main$clarity)
data_main$clarity <- revalue(
  data_main$clarity,
  c("I1" = "1", "IF" = "2", "SI1" = "3", "SI2" = "4", "VS1" = "5",
    "VS2" = "6", "VVS1" = "7", "VVS2" = "8")
)

# Applying linear regression ----
# Regress price on every other column except `X`. read.csv() names an
# unlabelled leading row-index column `X`; it is a row number, not a diamond
# attribute, so it is kept out of the predictors. setdiff() is a no-op when
# the data has no such column. The lowercase `x` (length in mm) stays in.
model_cols <- setdiff(names(data_main), "X")
model1 <- lm(price ~ ., data = data_main[, model_cols, drop = FALSE])

summary(model1)

# Standard lm diagnostic plots
plot(model1)

# Extracting residuals from model1
residuals1 <- resid(model1)

# Plotting the residuals against observation order
plot(residuals1)
Steps and Output

1. Read the data and see how data looks.


2. Summary of the data is:

X carat cut color


Min. : 1 Min. :0.2000 Fair : 1610 D: 6775
1st Qu.:13486 1st Qu.:0.4000 Good : 4906 E: 9797
Median :26971 Median :0.7000 Ideal :21551 F: 9542
Mean :26971 Mean :0.7979 Premium :13791 G:11292
3rd Qu.:40455 3rd Qu.:1.0400 Very Good:12082 H: 8304
Max. :53940 Max. :5.0100 I: 5422
J: 2808
clarity depth table price
SI1 :13065 Min. :43.00 Min. :43.00 Min. : 326
VS2 :12258 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950
SI2 : 9194 Median :61.80 Median :57.00 Median : 2401
VS1 : 8171 Mean :61.75 Mean :57.46 Mean : 3933
VVS2 : 5066 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324
VVS1 : 3655 Max. :79.00 Max. :95.00 Max. :18823
(Other): 2531
X Y Z
Min. : 0.000 Min. : 0.00 Min. : 0.000
1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
Median : 5.700 Median : 5.710 Median : 3.530
Mean : 5.731 Mean : 5.735 Mean : 3.539
3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
Max. :10.740 Max. :58.900 Max. :31.800

3. Categorizing variables:

Qualitative Features (Categorical): Cut, Color, Clarity.


Quantitative Features (Numerical): Carat, Depth, Table, Price, X, Y, Z.
The target variable is price.

4. Create factors for categorical variables


5. Find the missing values

Price 0
Carat 0
Cut 0
Color 0
Clarity 0
X 0
Y 0
Z 0
Depth 0
Table 0

6. Next, we identify outliers by computing the 98th percentile of each continuous variable.


7. We then replace every value above the 98th percentile with the 98th percentile value. This step
and the previous one apply only to the continuous variables.
8. Next we plot each variable against price to see the impact.
9. Next we run the linear regression model.

Residuals:
Min 1Q Median 3Q Max
-21248.6 -618.6 -170.8 414.6 8640.5

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.094e+03 4.412e+02 -4.746 2.08e-06 ***
X 4.998e-03 3.343e-04 14.951 < 2e-16 ***
clarity2 5.245e+03 4.917e+01 106.662 < 2e-16 ***
clarity3 3.551e+03 4.210e+01 84.339 < 2e-16 ***
clarity4 2.624e+03 4.225e+01 62.105 < 2e-16 ***
clarity5 4.470e+03 4.296e+01 104.051 < 2e-16 ***
clarity6 4.163e+03 4.229e+01 98.425 < 2e-16 ***
clarity7 4.930e+03 4.544e+01 108.503 < 2e-16 ***
clarity8 4.860e+03 4.421e+01 109.949 < 2e-16 ***
depth -3.715e+01 5.431e+00 -6.840 8.01e-12 ***
table -2.191e+01 2.931e+00 -7.478 7.69e-14 ***
x -8.806e+02 8.136e+01 -10.824 < 2e-16 ***
y 3.795e+02 8.042e+01 4.718 2.38e-06 ***
z 1.190e+02 6.100e+01 1.951 0.0511 .
carat 9.753e+03 4.245e+01 229.756 < 2e-16 ***
cut2 6.178e+02 3.237e+01 19.088 < 2e-16 ***
cut3 8.710e+02 3.121e+01 27.907 < 2e-16 ***
cut4 8.236e+02 3.018e+01 27.294 < 2e-16 ***
cut5 7.692e+02 3.078e+01 24.991 < 2e-16 ***
color2 -2.097e+02 1.723e+01 -12.171 < 2e-16 ***
color3 -2.837e+02 1.743e+01 -16.280 < 2e-16 ***
color4 -4.820e+02 1.706e+01 -28.247 < 2e-16 ***
color5 -9.537e+02 1.814e+01 -52.565 < 2e-16 ***
color6 -1.427e+03 2.038e+01 -70.010 < 2e-16 ***
color7 -2.289e+03 2.517e+01 -90.963 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1088 on 53915 degrees of freedom


Multiple R-squared: 0.9224, Adjusted R-squared: 0.9224
F-statistic: 2.67e+04 on 24 and 53915 DF, p-value: < 2.2e-16

10. Finally, we plot the residuals.

You might also like