Diamonds
# Attach ggplot2, which provides the plotting functions used below.
library(ggplot2)
# Load the diamonds data set into the workspace.
data(diamonds)
How many observations are in the data set?
# Number of rows, i.e. observations.
nrow(diamonds)
## [1] 53940
How many variables are in the data set?
# Number of columns, i.e. variables.
ncol(diamonds)
## [1] 10
How many ordered factors are in the data set?
# Show each column's type; ordered factors are reported as "Ord.factor".
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
What letter represents the best color for a diamond?
# color is an ordered factor; its first level is taken as the best color.
levels(diamonds$color)[1]
## [1] "D"
Price Histogram
# Distribution of price over all diamonds, using bins 100 wide.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100, colour = "blue", fill = "#099DD9") +
  labs(x = "Price", y = "Frequency", title = "Histogram of diamonds prices")
Price Summary
# Minimum, quartiles, mean and maximum of price.
summary(diamonds[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
Diamond Counts
How many diamonds cost less than $500?
# Count the rows whose price is below 500 (missing prices are not counted).
sum(diamonds$price < 500, na.rm = TRUE)
## [1] 1729
How many diamonds cost less than $250?
# Count the rows whose price is below 250 (missing prices are not counted).
sum(diamonds$price < 250, na.rm = TRUE)
## [1] 0
How many diamonds cost $15,000 or more?
# Count the rows whose price is at least 15000 (missing prices are not counted).
sum(diamonds$price >= 15000, na.rm = TRUE)
## [1] 1656
Cheaper Diamonds
# Histogram restricted to diamonds priced under 1500, with one bin per price unit.
ggplot(subset(diamonds, price < 1500), aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "blue", fill = "#099DD9") +
  labs(x = "Price", y = "Frequency", title = "Histogram of cheaper diamonds")
Price by Cut Histograms
# One price histogram per cut, laid out in two columns; scales = "free"
# lets every panel choose its own x and y ranges.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "blue", fill = "#099DD9") +
  facet_wrap(~cut, ncol = 2, scales = "free") +
  labs(x = "Price", y = "Frequency", title = "Histogram of diamonds by cut")
Price by Cut
# Apply summary() to price separately within each level of cut.
by(diamonds$price, diamonds$cut, summary)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 2050 3282 4359 5206 18570
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 1145 3050 3929 5028 18790
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 912 2648 3982 5373 18820
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1046 3185 4584 6296 18820
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 878 1810 3458 4678 18810
Price per Carat by Cut
# Price per carat for each cut on a log10 x axis. Because the axis is
# log-transformed, the 0.05 bin width is measured in log10 units.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05, colour = "blue", fill = "#099DD9") +
  scale_x_log10() +
  facet_wrap(~cut, ncol = 2, scales = "free") +
  labs(x = "Price per Carat", y = "Frequency", title = "Price per Carat by Cut")
Price Box Plots
# Box plot of price for each cut level.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot() +
  labs(x = "Cut", y = "Price", title = "Boxplot. Price by Cut")
# Box plot of price for each clarity level.
ggplot(diamonds, aes(x = clarity, y = price)) +
  geom_boxplot() +
  labs(x = "Clarity", y = "Price", title = "Boxplot. Price by Clarity")
# Box plot of price for each color level.
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  labs(x = "Color", y = "Price", title = "Boxplot. Price by Color")
Interquartile Range
What is the price range for the middle 50% of diamonds with color D?
# Summary of price for color D; the 1st and 3rd quartiles bound the middle 50%.
summary(diamonds$price[which(diamonds$color == "D")])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
# Alternative: ask directly for the 25th and 75th percentiles of price for color D.
quantile(diamonds$price[which(diamonds$color == "D")], probs = c(0.25, 0.75))
## 25% 75%
## 911.0 4213.5
What is the price range for the middle 50% of diamonds with color J?
# 25th and 75th percentiles of price for color J.
quantile(diamonds$price[which(diamonds$color == "J")], probs = c(0.25, 0.75))
## 25% 75%
## 1860.5 7695.0
What is the IQR for diamonds with the best color?
# Interquartile range of price for the first (best) level of color.
with(diamonds, IQR(price[which(color == levels(color)[1])]))
## [1] 3302.5
What is the IQR for diamonds with the worst color?
# Interquartile range of price for the last (worst) level of color.
with(diamonds, IQR(price[which(color == levels(color)[nlevels(color)])]))
## [1] 5834.5
Price per Carat Box Plots by Color
# Box plot of price per carat for each color level.
# The outline colour is a constant setting rather than a mapped aesthetic,
# so it is passed as a plain string; wrapping it in I() is a qplot() idiom
# and is unnecessary for a geom parameter.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot(colour = "#3366FF") +
  labs(x = "Color", y = "Price per Carat", title = "Price per Carat by Color")
Carat Frequency Polygon
# Frequency polygon of carat using bins 0.01 wide.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01, colour = "blue") +
  labs(x = "Carat", y = "Frequency", title = "Carat Frequency Polygon")
Which carat sizes have a frequency greater than 2000?
# Tabulate how many diamonds share each exact carat value, then keep only
# the values that occur more than 2000 times.
# A descriptive name is used instead of `t`, which would mask base::t().
carat_counts <- table(diamonds$carat)
carat_counts[carat_counts > 2000]
##
## 0.3 0.31 1.01
## 2604 2249 2242
Session Info
# Print the R version, platform and package versions used for this analysis.
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] markdown_0.7.4 knitr_1.9 ggplot2_1.0.0 gridExtra_0.9.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_1.0
## [5] gtable_0.1.2 htmltools_0.2.6 labeling_0.3 MASS_7.3-35
## [9] mime_0.2 munsell_0.4.2 plyr_1.8.1 proto_0.3-10
## [13] Rcpp_0.11.3 reshape2_1.4 rmarkdown_0.3.8 scales_0.2.4
## [17] stringr_0.6.2 tools_3.1.2
No comments:
Post a Comment