First look at data.
library(ggplot2)
library(hexbin)
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Scatterplot of Price vs X (length in mm)
ggplot(data=diamonds, aes(x = x, y = price)) + geom_point() +
xlab("Length in mm") + ylab("Price") + ggtitle("Price vs Length in mm")
Same scatterplot without outliers.
ggplot(data=diamonds, aes(x = x, y = price)) + geom_jitter(alpha=1/15, position = position_jitter(h=0)) +
xlab("Length in mm") + ylab("Price") + ggtitle("Price vs Length in mm") +
coord_cartesian(xlim=quantile(diamonds$x, probs=c(.025, .975)), ylim=quantile(diamonds$price, probs=c(.025,.975)))
What are observation about the scatterplot of price vs x?
It's looks like there are exists quadratic dependency between variables.
From plot we can see few groups of diamonds price's with respect to length.
There exists outliers.
Correlations
What is the correlation between price and x ?
with(diamonds, cor.test(price, x, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.1594, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
What is the correlation between price and y?
with(diamonds, cor.test(price, y, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.1415, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
What is the correlation between price and z?
with(diamonds, cor.test(price, z, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6015, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
Price vs Depth
ggplot(data=diamonds, aes(x=depth, y = price)) + xlab("Depth") +
ylab("Price") + ggtitle("Price vs Depth") +
stat_binhex() +
scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2))
Most diamonds are between this values of depth.
with(diamonds, quantile(depth, probs=c(0.05,0.95)))
## 5% 95%
## 59.3 63.8
What's correlation between of depth vs price?
with(diamonds, cor.test(depth, price, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
For prediction depth isn't so important looking at correlation coefficient, but we can't say that unimportant at all using significant level 5%. P-value is less than .05, so wee can reject hypothesys that true correlation is equal to 0.
Price vs Carat
Very intelligent scatterplot.
ggplot(data=diamonds, aes(x=carat, y = price)) + xlab("Carat") +
ylab("Price") + ggtitle("Price vs Carat") +
stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=1), contour=FALSE) +
geom_point(size=0.5) +
stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=ifelse(..density..^0.25<0.4,0,1)), contour=FALSE) +
scale_fill_gradientn(colours = colorRampPalette(c("white", blues9))(256))
Price vs Volume
diamonds$volume = with(diamonds, x*y*z)
ggplot(data=diamonds, aes(x=volume, y=price)) +
xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") +
geom_jitter(alpha=1/20, position=position_jitter(h=0), color='blue') +
coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)))
Correlation between Price and Volume.
with(diamonds, cor.test(price, volume, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 486.3295, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9008054 0.9039398
## sample estimates:
## cor
## 0.9023845
How many diamonds have 0 Volume?
table(diamonds$volume == 0)[2]
## TRUE
## 20
Correlation between Price and Volume without outliers.
with(subset(diamonds, volume > 0 & volume < 800), cor.test(volume, price, method='pearson'))
##
## Pearson's product-moment correlation
##
## data: volume and price
## t = 559.1912, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
Price vs Volume Smooth
ggplot(data=diamonds, aes(x=volume, y=price)) +
xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") +
geom_jitter(alpha=1/10, position=position_jitter(h=0)) +
coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)), ylim = c(0, max(diamonds$price))) +
stat_smooth(method = "gam", formula = y ~ s(x), size = 1)
Price vs Volume by Color
ggplot(data=diamonds, aes(x=volume, y=price, color = color)) +
xlab("Volume") + ylab("Price") + ggtitle("Price vs Volume") +
geom_jitter(alpha=1/4, position=position_jitter(h=0)) +
coord_cartesian(xlim = c(0, quantile(diamonds$volume, 0.99)), ylim = c(0, max(diamonds$price)))
Mean price by Clarity using dplyr
library(dplyr)
diamondsByClarity <- diamonds %>% group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(as.numeric(price)),
max_price = max(price),
min_price = min(price),
n = n()) %>%
arrange(clarity)
diamondsByClarity
## Source: local data frame [8 x 6]
##
## clarity mean_price median_price max_price min_price n
## 1 I1 3924.169 3344 18531 345 741
## 2 SI2 5063.029 4072 18804 326 9194
## 3 SI1 3996.001 2822 18818 326 13065
## 4 VS2 3924.989 2054 18823 334 12258
## 5 VS1 3839.455 2005 18795 327 8171
## 6 VVS2 3283.737 1311 18768 336 5066
## 7 VVS1 2523.115 1093 18777 336 3655
## 8 IF 2864.839 1080 18806 369 1790
Bar chart of mean price
library(gridExtra)
mean_by_clarity <- diamonds %>% group_by(clarity) %>% summarise(mean_price = mean(price))
mean_by_color <- diamonds %>% group_by(color) %>% summarise(mean_price=mean(price))
mean_by_cut <- diamonds %>% group_by(cut) %>% summarise(mean_price = mean(price))
g1 <- ggplot(data=mean_by_clarity, aes(y=mean_price, x = clarity, fill=clarity)) + geom_bar(stat='identity', colour="black") + xlab("Clarity") + ylab("Mean Price") + ggtitle("Mean price by clarity")
g2 <- ggplot(data=mean_by_color, aes(y=mean_price, x = color, fill = color)) + geom_bar(stat='identity',colour="black") + xlab("Color") + ylab("Mean Price") + ggtitle("Mean price by color")
g3 <- ggplot(data=mean_by_cut, aes(y=mean_price, x = cut, fill = cut)) + geom_bar(stat='identity',colour="black") + xlab("Cut") + ylab("Mean Price") + ggtitle("Mean price by cur")
grid.arrange(g1,g2,g3)
Session Info
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggsubplot_0.3.2 dplyr_0.4.1 hexbin_1.27.0 mgcv_1.8-3
## [5] nlme_3.1-118 alr3_2.0.5 car_2.0-21 markdown_0.7.4
## [9] knitr_1.9 ggplot2_1.0.0 gridExtra_0.9.1
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 colorspace_1.2-4 DBI_0.3.1 digest_0.6.4
## [5] evaluate_0.5.5 formatR_1.0 gtable_0.1.2 htmltools_0.2.6
## [9] labeling_0.3 lattice_0.20-29 lazyeval_0.1.10 magrittr_1.5
## [13] MASS_7.3-35 Matrix_1.1-5 mime_0.2 munsell_0.4.2
## [17] nnet_7.3-8 parallel_3.1.2 plyr_1.8.1 proto_0.3-10
## [21] Rcpp_0.11.3 reshape2_1.4 rmarkdown_0.3.8 scales_0.2.4
## [25] stringr_0.6.2 tools_3.1.2 yaml_2.1.13
No comments:
Post a Comment