First look at data.
library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Price Histograms with Facet and Color
ggplot(data=diamonds, aes(x=price, color = cut)) +
geom_histogram(binwidth=0.1) +
facet_wrap(~color) + scale_x_log10() +
xlab("Price") + ylab("Frequency") + ggtitle("Price with Facet and Color")

Price vs Table Colored by Cut
ggplot(data=diamonds, aes(x=table, y=price, color = cut)) +
geom_point() +
xlab("Table") + ylab("Price") + ggtitle("Price vs Table by Cut") +
scale_color_brewer(type = 'qual')

What is the typical table range for the majority of diamonds of ideal cut?
with(subset(diamonds, cut == "Ideal") , quantile(table, probs= c(0.1,0.9)))
## 10% 90%
## 54 57
What is the typical table range for the majority of diamonds of premium cut?
with(subset(diamonds, cut=='Premium'), quantile(table, probs=c(0.1,0.9)))
## 10% 90%
## 57 61
Price vs Volume and Diamond Clarity
diamonds$volume <- with(diamonds, x*y*z)
ggplot(data=subset(diamonds,volume < quantile(volume, probs=0.99) & volume > 0), aes(x =volume, y=price, color=clarity)) +
geom_point() + scale_color_brewer(type = 'div') + scale_y_log10() +
xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume by Clarity")

Price/Carat Binned, Faceted, & Colored
ggplot( data = diamonds,aes( x = cut, y = price/carat, color = color)) +
geom_point(position = position_jitter(width = 0.4)) +
facet_wrap(~clarity) +
scale_color_brewer(type='div') +
xlab("Cut") + ylab("Price/Carat")

Price vs Carat and Clarity with Transformations
library(scales)
cuberoot_trans <- function() trans_new(
'cuberoot', transform=function(x) x^(1/3),
inverse=function(x) x^3)
ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')

Price vs Carat and Cut with Transformation
ggplot(aes(x = carat, y = price, color = cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Cut', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')

Price vs Carat and Color with Transformation
ggplot(aes(x = carat, y = price, color = color), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Color', reverse = F,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')

Build Linear Model for Price
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'memisc'
##
## The following object is masked from 'package:scales':
##
## percent
##
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
##
## The following object is masked from 'package:base':
##
## as.array
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data=diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1,m2,m3,m4,m5)
##
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color,
## data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +
## clarity, data = diamonds)
##
## ======================================================================
## m1 m2 m3 m4 m5
## ----------------------------------------------------------------------
## (Intercept) 2.821*** 1.039*** 0.874*** 0.932*** 0.415***
## (0.006) (0.019) (0.019) (0.017) (0.010)
## I(carat^(1/3)) 5.558*** 8.568*** 8.703*** 8.438*** 9.144***
## (0.007) (0.032) (0.031) (0.028) (0.016)
## carat -1.137*** -1.163*** -0.992*** -1.093***
## (0.012) (0.011) (0.010) (0.006)
## cut: .L 0.224*** 0.224*** 0.120***
## (0.004) (0.004) (0.002)
## cut: .Q -0.062*** -0.062*** -0.031***
## (0.004) (0.003) (0.002)
## cut: .C 0.051*** 0.052*** 0.014***
## (0.003) (0.003) (0.002)
## cut: ^4 0.018*** 0.018*** -0.002
## (0.003) (0.002) (0.001)
## color: .L -0.373*** -0.441***
## (0.003) (0.002)
## color: .Q -0.129*** -0.093***
## (0.003) (0.002)
## color: .C 0.001 -0.013***
## (0.003) (0.002)
## color: ^4 0.029*** 0.012***
## (0.003) (0.002)
## color: ^5 -0.016*** -0.003*
## (0.003) (0.001)
## color: ^6 -0.023*** 0.001
## (0.002) (0.001)
## clarity: .L 0.907***
## (0.003)
## clarity: .Q -0.240***
## (0.003)
## clarity: .C 0.131***
## (0.003)
## clarity: ^4 -0.063***
## (0.002)
## clarity: ^5 0.026***
## (0.002)
## clarity: ^6 -0.002
## (0.002)
## clarity: ^7 0.032***
## (0.001)
## ----------------------------------------------------------------------
## R-squared 0.924 0.935 0.939 0.951 0.984
## adj. R-squared 0.924 0.935 0.939 0.951 0.984
## sigma 0.280 0.259 0.250 0.224 0.129
## F 652012.063 387489.366 138654.523 87959.467 173791.084
## p 0.000 0.000 0.000 0.000 0.000
## Log-likelihood -7962.499 -3631.319 -1837.416 4235.240 34091.272
## Deviance 4242.831 3613.360 3380.837 2699.212 892.214
## AIC 15930.999 7270.637 3690.832 -8442.481 -68140.544
## BIC 15957.685 7306.220 3761.997 -8317.942 -67953.736
## N 53940 53940 53940 53940 53940
## ======================================================================
Session Info
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] memisc_0.96-10 MASS_7.3-35 lattice_0.20-29 scales_0.2.4
## [5] ggplot2_1.0.0
##
## loaded via a namespace (and not attached):
## [1] car_2.0-21 colorspace_1.2-4 digest_0.6.4
## [4] evaluate_0.5.5 formatR_1.0 grid_3.1.2
## [7] gtable_0.1.2 htmltools_0.2.6 knitr_1.9
## [10] labeling_0.3 munsell_0.4.2 nnet_7.3-8
## [13] plyr_1.8.1 proto_0.3-10 RColorBrewer_1.0-5
## [16] Rcpp_0.11.3 reshape2_1.4 rmarkdown_0.3.8
## [19] stringr_0.6.2 tools_3.1.2
No comments:
Post a Comment