Sunday, February 1, 2015

Exploring Diamonds Dataset Part 1

Diamonds

library(ggplot2)
data(diamonds)

How many observations are in the data set?

# Number of rows (observations) in the data set
nrow(diamonds)
## [1] 53940

How many variables are in the data set?

# Number of columns (variables) in the data set
ncol(diamonds)
## [1] 10

How many ordered factors are in the data set?

# Show each column's type; ordered factors are reported as "Ord.factor"
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

What letter represents the best color for a diamond?

# The first level of the ordered color factor
color_levels <- levels(diamonds$color)
head(color_levels, n = 1)
## [1] "D"

Price Histogram

# Distribution of price over all diamonds, in bins 100 wide
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100, color = "blue", fill = "#099DD9") +
  labs(x = "Price", y = "Frequency", title = "Histogram of diamonds prices")

plot of chunk unnamed-chunk-6

Price Summary

# Minimum, quartiles, mean and maximum of the price column
summary(diamonds[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

Diamond Counts

How many diamonds cost less than $500?

# Count the rows whose price is below 500
sum(diamonds$price < 500, na.rm = TRUE)
## [1] 1729

How many diamonds cost less than $250?

# Count the rows whose price is below 250
sum(diamonds$price < 250, na.rm = TRUE)
## [1] 0

How many diamonds cost $15,000 or more?

# Count the rows whose price is 15000 or more
sum(diamonds$price >= 15000, na.rm = TRUE)
## [1] 1656

Cheaper Diamonds

# Zoom in on diamonds priced under 1500, one bin per unit of price
cheap_diamonds <- subset(diamonds, price < 1500)
ggplot(cheap_diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1, color = "blue", fill = "#099DD9") +
  labs(x = "Price", y = "Frequency", title = "Histogram of cheaper diamonds")

plot of chunk unnamed-chunk-11

Price by Cut Histograms

# Price distribution for each cut, one panel per cut with free axes.
# binwidth = 100 matches the overall price histogram; a binwidth of 1 over
# the full price range (326 to 18820) yields roughly 18,000 bars per
# panel, which is slow to render and mostly shows the bar outlines.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100, color = "blue", fill = "#099DD9") +
  facet_wrap(~cut, ncol = 2, scales = "free") +
  labs(x = "Price", y = "Frequency", title = "Histogram of diamonds by cut")

plot of chunk unnamed-chunk-12

Price by Cut

# Summary statistics of price computed separately for each cut level
by(diamonds$price, diamonds$cut, summary)
## diamonds$cut: Fair
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3282    4359    5206   18570 
## -------------------------------------------------------- 
## diamonds$cut: Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327    1145    3050    3929    5028   18790 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     912    2648    3982    5373   18820 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1046    3185    4584    6296   18820 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     878    1810    3458    4678   18810

Price per Carat by Cut

# Price per carat for each cut, drawn on a log10 x axis with free panel
# scales
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05, color = "blue", fill = "#099DD9") +
  facet_wrap(~cut, ncol = 2, scales = "free") +
  labs(
    x = "Price per Carat",
    y = "Frequency",
    title = "Price per Carat by Cut"
  ) +
  scale_x_log10()

plot of chunk unnamed-chunk-15

Price Box Plots

# Price distribution within each cut
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot() +
  labs(x = "Cut", y = "Price", title = "Boxplot. Price by Cut")

plot of chunk unnamed-chunk-16

# Price distribution within each clarity grade
ggplot(diamonds, aes(x = clarity, y = price)) +
  geom_boxplot() +
  labs(x = "Clarity", y = "Price", title = "Boxplot. Price by Clarity")

plot of chunk unnamed-chunk-16

# Price distribution within each color grade
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  labs(x = "Color", y = "Price", title = "Boxplot. Price by Color")

plot of chunk unnamed-chunk-16

Interquartile Range

What is the price range for the middle 50% of diamonds with color D?

# Quartiles (plus min, mean, max) of price for diamonds with color "D"
d_prices <- subset(diamonds, color == "D")$price
summary(d_prices)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690
# Alternative: request only the first and third quartiles
quantile(subset(diamonds, color == "D")[["price"]], probs = c(0.25, 0.75))
##    25%    75% 
##  911.0 4213.5

What is the price range for the middle 50% of diamonds with color J?

# First and third quartiles of price for diamonds with color "J"
quantile(subset(diamonds, color == "J")[["price"]], probs = c(0.25, 0.75))
##    25%    75% 
## 1860.5 7695.0

What is the IQR for diamonds with the best color?

# Interquartile range of price for the first level of the color factor
best_color <- head(levels(diamonds$color), n = 1)
IQR(subset(diamonds, color == best_color)$price)
## [1] 3302.5

What is the IQR for diamonds with the worst color?

# Interquartile range of price for the last level of the color factor
worst_color <- tail(levels(diamonds$color), n = 1)
IQR(subset(diamonds, color == worst_color)$price)
## [1] 5834.5

Price per Carat Box Plots by Color

# Price per carat within each color grade, boxes outlined in #3366FF
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot(colour = I("#3366FF")) +
  labs(x = "Color", y = "Price per Carat", title = "Price per Carat by Color")

plot of chunk unnamed-chunk-21

Carat Frequency Polygon

# Frequency polygon of carat, counted in bins 0.01 wide
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01, color = "blue") +
  labs(x = "Carat", y = "Frequency", title = "Carat Frequency Polygon")

plot of chunk unnamed-chunk-22

Which carat sizes have a frequency greater than 2000?

# Tabulate how many diamonds share each distinct carat value, then keep
# only the values that occur more than 2000 times. The table gets a
# descriptive name rather than `t`, which is also the name of base R's
# transpose function t().
carat_counts <- table(diamonds$carat)
carat_counts[carat_counts > 2000]
## 
##  0.3 0.31 1.01 
## 2604 2249 2242

Session Info

# Print the R version, platform, locale and package versions in use
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] markdown_0.7.4  knitr_1.9       ggplot2_1.0.0   gridExtra_0.9.1
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_1.0     
##  [5] gtable_0.1.2     htmltools_0.2.6  labeling_0.3     MASS_7.3-35     
##  [9] mime_0.2         munsell_0.4.2    plyr_1.8.1       proto_0.3-10    
## [13] Rcpp_0.11.3      reshape2_1.4     rmarkdown_0.3.8  scales_0.2.4    
## [17] stringr_0.6.2    tools_3.1.2

No comments:

Post a Comment