Thursday, February 5, 2015

Exploring Diamonds Dataset Part 3

First look at data.

library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Price Histograms with Facet and Color

 ggplot(data=diamonds, aes(x=price, color = cut)) +
  geom_histogram(binwidth=0.1) +
  facet_wrap(~color) + scale_x_log10() +
  xlab("Price") + ylab("Frequency") + ggtitle("Price with Facet and Color")

Price vs Table Colored by Cut

 ggplot(data=diamonds, aes(x=table, y=price, color = cut)) + 
  geom_point() +
  xlab("Table") + ylab("Price") + ggtitle("Price vs Table by Cut") + 
  scale_color_brewer(type = 'qual')

What is the typical table range for the majority of diamonds of ideal cut?

with(subset(diamonds, cut == "Ideal") , quantile(table, probs= c(0.1,0.9)))
## 10% 90% 
##  54  57

What is the typical table range for the majority of diamonds of premium cut?

with(subset(diamonds, cut=='Premium'), quantile(table, probs=c(0.1,0.9)))
## 10% 90% 
##  57  61

Price vs Volume and Diamond Clarity

diamonds$volume <- with(diamonds, x*y*z)
 ggplot(data=subset(diamonds,volume < quantile(volume, probs=0.99) & volume > 0), aes(x =volume, y=price, color=clarity)) +
  geom_point() + scale_color_brewer(type = 'div') + scale_y_log10() +
  xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume by Clarity")

Price/Carat Binned, Faceted, & Colored

 ggplot( data = diamonds,aes( x = cut, y = price/carat, color = color)) + 
  geom_point(position = position_jitter(width = 0.4)) +
  facet_wrap(~clarity) + 
  scale_color_brewer(type='div') +
  xlab("Cut") + ylab("Price/Carat")

Price vs Carat and Clarity with Transformations

library(scales)
cuberoot_trans <- function() trans_new(
  'cuberoot', transform=function(x) x^(1/3),
  inverse=function(x) x^3)

ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Clarity', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')

Price vs Carat and Cut with Transformation

 ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Cut', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')

Price vs Carat and Color with Transformation

 ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Color', reverse = F,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Color')

Build Linear Model for Price

library(memisc)
## Loading required package: lattice
## Loading required package: MASS
## 
## Attaching package: 'memisc'
## 
## The following object is masked from 'package:scales':
## 
##     percent
## 
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## 
## The following object is masked from 'package:base':
## 
##     as.array
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data=diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1,m2,m3,m4,m5)
## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamonds)
## 
## ======================================================================
##                     m1         m2         m3         m4         m5    
## ----------------------------------------------------------------------
## (Intercept)      2.821***   1.039***   0.874***   0.932***   0.415*** 
##                 (0.006)    (0.019)    (0.019)    (0.017)    (0.010)   
## I(carat^(1/3))   5.558***   8.568***   8.703***   8.438***   9.144*** 
##                 (0.007)    (0.032)    (0.031)    (0.028)    (0.016)   
## carat                      -1.137***  -1.163***  -0.992***  -1.093*** 
##                            (0.012)    (0.011)    (0.010)    (0.006)   
## cut: .L                                0.224***   0.224***   0.120*** 
##                                       (0.004)    (0.004)    (0.002)   
## cut: .Q                               -0.062***  -0.062***  -0.031*** 
##                                       (0.004)    (0.003)    (0.002)   
## cut: .C                                0.051***   0.052***   0.014*** 
##                                       (0.003)    (0.003)    (0.002)   
## cut: ^4                                0.018***   0.018***  -0.002    
##                                       (0.003)    (0.002)    (0.001)   
## color: .L                                        -0.373***  -0.441*** 
##                                                  (0.003)    (0.002)   
## color: .Q                                        -0.129***  -0.093*** 
##                                                  (0.003)    (0.002)   
## color: .C                                         0.001     -0.013*** 
##                                                  (0.003)    (0.002)   
## color: ^4                                         0.029***   0.012*** 
##                                                  (0.003)    (0.002)   
## color: ^5                                        -0.016***  -0.003*   
##                                                  (0.003)    (0.001)   
## color: ^6                                        -0.023***   0.001    
##                                                  (0.002)    (0.001)   
## clarity: .L                                                  0.907*** 
##                                                             (0.003)   
## clarity: .Q                                                 -0.240*** 
##                                                             (0.003)   
## clarity: .C                                                  0.131*** 
##                                                             (0.003)   
## clarity: ^4                                                 -0.063*** 
##                                                             (0.002)   
## clarity: ^5                                                  0.026*** 
##                                                             (0.002)   
## clarity: ^6                                                 -0.002    
##                                                             (0.002)   
## clarity: ^7                                                  0.032*** 
##                                                             (0.001)   
## ----------------------------------------------------------------------
## R-squared            0.924      0.935      0.939     0.951       0.984
## adj. R-squared       0.924      0.935      0.939     0.951       0.984
## sigma                0.280      0.259      0.250     0.224       0.129
## F               652012.063 387489.366 138654.523 87959.467  173791.084
## p                    0.000      0.000      0.000     0.000       0.000
## Log-likelihood   -7962.499  -3631.319  -1837.416  4235.240   34091.272
## Deviance          4242.831   3613.360   3380.837  2699.212     892.214
## AIC              15930.999   7270.637   3690.832 -8442.481  -68140.544
## BIC              15957.685   7306.220   3761.997 -8317.942  -67953.736
## N                53940      53940      53940     53940       53940    
## ======================================================================

Session Info

sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] memisc_0.96-10  MASS_7.3-35     lattice_0.20-29 scales_0.2.4   
## [5] ggplot2_1.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] car_2.0-21         colorspace_1.2-4   digest_0.6.4      
##  [4] evaluate_0.5.5     formatR_1.0        grid_3.1.2        
##  [7] gtable_0.1.2       htmltools_0.2.6    knitr_1.9         
## [10] labeling_0.3       munsell_0.4.2      nnet_7.3-8        
## [13] plyr_1.8.1         proto_0.3-10       RColorBrewer_1.0-5
## [16] Rcpp_0.11.3        reshape2_1.4       rmarkdown_0.3.8   
## [19] stringr_0.6.2      tools_3.1.2

No comments:

Post a Comment