Monday, February 2, 2015

Exploring Diamonds Dataset Part 2

First look at data.

library(ggplot2)
library(hexbin)
data(diamonds)
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Scatterplot of Price vs X (length in mm)

ggplot(data=diamonds, aes(x = x, y = price)) + geom_point() + 
  xlab("Length in mm") + ylab("Price") + ggtitle("Price vs Length in mm")

plot of chunk unnamed-chunk-2

Same scatterplot without outliers.

ggplot(data=diamonds, aes(x = x, y = price)) + geom_jitter(alpha=1/15, position = position_jitter(h=0)) + 
  xlab("Length in mm") + ylab("Price") + ggtitle("Price vs Length in mm") +
  coord_cartesian(xlim=quantile(diamonds$x, probs=c(.025, .975)), ylim=quantile(diamonds$price, probs=c(.025,.975))) 

plot of chunk unnamed-chunk-3

What are observation about the scatterplot of price vs x?

  1. It's looks like there are exists quadratic dependency between variables.

  2. From plot we can see few groups of diamonds price's with respect to length.

  3. There exists outliers.

Correlations

What is the correlation between price and x ?

with(diamonds, cor.test(price, x, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.1594, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

What is the correlation between price and y?

with(diamonds, cor.test(price, y, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.1415, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

What is the correlation between price and z?

with(diamonds, cor.test(price, z, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6015, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Price vs Depth

ggplot(data=diamonds, aes(x=depth, y = price)) + xlab("Depth") +
  ylab("Price") + ggtitle("Price vs Depth") +
  stat_binhex() +
  scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2))

plot of chunk unnamed-chunk-7

Most diamonds are between this values of depth.

with(diamonds, quantile(depth, probs=c(0.05,0.95)))
##   5%  95% 
## 59.3 63.8

What's correlation between of depth vs price?

with(diamonds, cor.test(depth, price, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

For prediction depth isn't so important looking at correlation coefficient, but we can't say that unimportant at all using significant level 5%. P-value is less than .05, so wee can reject hypothesys that true correlation is equal to 0.

Price vs Carat

Very intelligent scatterplot.

ggplot(data=diamonds, aes(x=carat, y = price)) + xlab("Carat") +
  ylab("Price") + ggtitle("Price vs Carat") +
  stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=1), contour=FALSE) + 
  geom_point(size=0.5) +
  stat_density2d(geom="tile", aes(fill=..density..^0.25,     alpha=ifelse(..density..^0.25<0.4,0,1)), contour=FALSE) + 
  scale_fill_gradientn(colours = colorRampPalette(c("white", blues9))(256))

plot of chunk unnamed-chunk-10

Price vs Volume

diamonds$volume = with(diamonds, x*y*z)
ggplot(data=diamonds, aes(x=volume, y=price)) + 
  xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") + 
  geom_jitter(alpha=1/20, position=position_jitter(h=0), color='blue') + 
  coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)))

plot of chunk unnamed-chunk-11

Correlation between Price and Volume.

with(diamonds, cor.test(price, volume, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and volume
## t = 486.3295, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9008054 0.9039398
## sample estimates:
##       cor 
## 0.9023845

How many diamonds have 0 Volume?

table(diamonds$volume == 0)[2]
## TRUE 
##   20

Correlation between Price and Volume without outliers.

with(subset(diamonds, volume > 0 & volume < 800), cor.test(volume, price, method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  volume and price
## t = 559.1912, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Price vs Volume Smooth

ggplot(data=diamonds, aes(x=volume, y=price)) + 
  xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") + 
  geom_jitter(alpha=1/10, position=position_jitter(h=0)) + 
  coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)), ylim = c(0, max(diamonds$price))) +
  stat_smooth(method = "gam", formula = y ~ s(x), size = 1)

plot of chunk unnamed-chunk-15

Price vs Volume by Color

ggplot(data=diamonds, aes(x=volume, y=price, color = color)) + 
  xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") + 
  geom_jitter(alpha=1/4, position=position_jitter(h=0)) + 
  coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)), ylim = c(0, max(diamonds$price)))

Mean price by Clarity using dplyr

library(dplyr)
diamondsByClarity <- diamonds %>% group_by(clarity) %>%
  summarise(mean_price = mean(price),
            median_price = median(as.numeric(price)),
            max_price = max(price),
            min_price = min(price),
            n = n()) %>%
  arrange(clarity)
diamondsByClarity
## Source: local data frame [8 x 6]
## 
##   clarity mean_price median_price max_price min_price     n
## 1      I1   3924.169         3344     18531       345   741
## 2     SI2   5063.029         4072     18804       326  9194
## 3     SI1   3996.001         2822     18818       326 13065
## 4     VS2   3924.989         2054     18823       334 12258
## 5     VS1   3839.455         2005     18795       327  8171
## 6    VVS2   3283.737         1311     18768       336  5066
## 7    VVS1   2523.115         1093     18777       336  3655
## 8      IF   2864.839         1080     18806       369  1790

Bar chart of mean price

library(gridExtra)
mean_by_clarity <- diamonds %>% group_by(clarity) %>% summarise(mean_price = mean(price))
mean_by_color <- diamonds %>% group_by(color) %>% summarise(mean_price=mean(price))
mean_by_cut <- diamonds %>% group_by(cut) %>% summarise(mean_price = mean(price))

g1 <- ggplot(data=mean_by_clarity, aes(y=mean_price, x = clarity, fill=clarity)) + geom_bar(stat='identity', colour="black") + xlab("Clarity") + ylab("Mean Price") + ggtitle("Mean price by clarity")
g2 <- ggplot(data=mean_by_color, aes(y=mean_price, x = color, fill = color)) + geom_bar(stat='identity',colour="black") + xlab("Color") + ylab("Mean Price") + ggtitle("Mean price by color")
g3 <- ggplot(data=mean_by_cut, aes(y=mean_price, x = cut, fill = cut)) + geom_bar(stat='identity',colour="black") + xlab("Cut") + ylab("Mean Price") + ggtitle("Mean price by cur")

grid.arrange(g1,g2,g3)

plot of chunk unnamed-chunk-18

Session Info

sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggsubplot_0.3.2 dplyr_0.4.1     hexbin_1.27.0   mgcv_1.8-3     
##  [5] nlme_3.1-118    alr3_2.0.5      car_2.0-21      markdown_0.7.4 
##  [9] knitr_1.9       ggplot2_1.0.0   gridExtra_0.9.1
## 
## loaded via a namespace (and not attached):
##  [1] assertthat_0.1   colorspace_1.2-4 DBI_0.3.1        digest_0.6.4    
##  [5] evaluate_0.5.5   formatR_1.0      gtable_0.1.2     htmltools_0.2.6 
##  [9] labeling_0.3     lattice_0.20-29  lazyeval_0.1.10  magrittr_1.5    
## [13] MASS_7.3-35      Matrix_1.1-5     mime_0.2         munsell_0.4.2   
## [17] nnet_7.3-8       parallel_3.1.2   plyr_1.8.1       proto_0.3-10    
## [21] Rcpp_0.11.3      reshape2_1.4     rmarkdown_0.3.8  scales_0.2.4    
## [25] stringr_0.6.2    tools_3.1.2      yaml_2.1.13

No comments:

Post a Comment