library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/Kateryna/Documents/2020/IndependentStudy/Rstudio
library(e1071)
library(skimr)
#library(lubridate)
#library(forcats)
#library(ggplot2)
# Read the sales data from data/sales.rds; here::here() resolves the path
# relative to the project root it reports when the package is attached.
sales <- readRDS(here::here("data", "sales.rds"))
Chapter 3: Two types of descriptive statistics: measures of central tendency and measures of dispersion.
# Central tendency of the floor_area_sqm column: arithmetic mean, then median.
mean(sales[["floor_area_sqm"]])
## [1] 97.58903
median(sales[["floor_area_sqm"]])
## [1] 96
# from https://stackoverflow.com/a/25635740
# Statistical mode: the value that occurs most often in `x`.
# Called `manual_mode` rather than `mode` because base R already has `mode()`.
#
# x      A vector.
# na.rm  If TRUE, NA entries are dropped before counting; if FALSE (default),
#        NA is counted like any other value.
#
# Returns a length-one vector holding the most frequent value. When several
# values tie, the one that appears first in `x` is returned.
manual_mode <- function(x, na.rm = FALSE) {
  values <- if (na.rm) x[!is.na(x)] else x
  distinct_values <- unique(values)
  # Map every element to the position of its distinct value, then count how
  # often each position occurs.
  occurrences <- tabulate(match(values, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Mode of the floor area, computed with manual_mode() defined earlier.
manual_mode(sales[["floor_area_sqm"]])
## [1] 67
# Dispersion measures. Each bare string below is auto-printed and acts as a
# label for the result that follows it.
"Range"
## [1] "Range"
diff(range(sales[["floor_area_sqm"]]))
## [1] 249
"Interquartile Range"
## [1] "Interquartile Range"
IQR(sales[["floor_area_sqm"]])
## [1] 36
"Standard Deviation"
## [1] "Standard Deviation"
sd(sales[["floor_area_sqm"]])
## [1] 24.22276
"Coefficient of variation"
## [1] "Coefficient of variation"
sd(sales[["floor_area_sqm"]]) / mean(sales[["floor_area_sqm"]])
## [1] 0.2482119
# Shape of the distribution; the calls are namespace-qualified to make the
# origin of kurtosis() and skewness() explicit.
"Kurtosis and Skewness from the 'e1071` library"
## [1] "Kurtosis and Skewness from the 'e1071` library"
e1071::kurtosis(sales[["floor_area_sqm"]])
## [1] -0.1450646
e1071::skewness(sales[["floor_area_sqm"]])
## [1] 0.2770161
"Summary"
## [1] "Summary"
summary(sales[["floor_area_sqm"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.00 76.00 96.00 97.59 112.00 280.00
"Skim"
## [1] "Skim"
# This call is left in `sales$floor_area_sqm` form: the "Name" field of the
# skim output that follows shows that exact expression.
skim(sales$floor_area_sqm)
Name | sales$floor_area_sqm |
Number of rows | 79100 |
Number of columns | 1 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | None |
Data summary
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
data | 0 | 1 | 97.59 | 24.22 | 31 | 76 | 96 | 112 | 280 | ▃▇▁▁▁ |
# Histogram of floor area with bins 5 units wide.
ggplot(data = sales, mapping = aes(x = floor_area_sqm)) +
  geom_histogram(binwidth = 5)
# Density-scaled histogram of floor area with a normal curve overlaid.
# The curve is dnorm() evaluated with the sample mean and standard deviation
# of floor_area_sqm, so the two can be compared on the same y scale.
# `after_stat(density)` replaces the deprecated `..density..` notation.
ggplot(sales, aes(x = floor_area_sqm)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(sales$floor_area_sqm),
      sd = sd(sales$floor_area_sqm)
    )
  )
# Inspect the rows whose floor area equals 67, the most frequent value
# reported by manual_mode() earlier in this script.
sales %>%
  dplyr::filter(floor_area_sqm == 67)
## # A tibble: 4,904 x 11
## month town flat_type block street_name storey_range floor_area_sqm
## <date> <chr> <fct> <chr> <chr> <fct> <dbl>
## 1 2015-01-01 ANG ~ 3 ROOM 603 ANG MO KIO~ 07 TO 09 67
## 2 2015-01-01 ANG ~ 3 ROOM 109 ANG MO KIO~ 01 TO 03 67
## 3 2015-01-01 ANG ~ 3 ROOM 218 ANG MO KIO~ 07 TO 09 67
## 4 2015-01-01 ANG ~ 3 ROOM 471 ANG MO KIO~ 07 TO 09 67
## 5 2015-01-01 ANG ~ 3 ROOM 434 ANG MO KIO~ 07 TO 09 67
## 6 2015-01-01 ANG ~ 3 ROOM 560 ANG MO KIO~ 07 TO 09 67
## 7 2015-01-01 ANG ~ 3 ROOM 631 ANG MO KIO~ 07 TO 09 67
## 8 2015-01-01 ANG ~ 3 ROOM 442 ANG MO KIO~ 10 TO 12 67
## 9 2015-01-01 ANG ~ 3 ROOM 558 ANG MO KIO~ 10 TO 12 67
## 10 2015-01-01 ANG ~ 3 ROOM 212 ANG MO KIO~ 10 TO 12 67
## # ... with 4,894 more rows, and 4 more variables: flat_model <fct>,
## # lease_commence_date <dbl>, remaining_lease <dbl>, resale_price <dbl>
#%>% View()
# Box plot of floor area over all rows; the constant x = 1 gives the plot a
# single position on the x axis.
ggplot(data = sales, mapping = aes(x = 1, y = floor_area_sqm)) +
  geom_boxplot()
# Same layout, drawn as a violin plot.
ggplot(data = sales, mapping = aes(x = 1, y = floor_area_sqm)) +
  geom_violin()
# Floor-area histograms (bins 10 units wide), one panel per flat type; each
# panel gets its own y-axis scale via scales = "free_y".
ggplot(data = sales, mapping = aes(x = floor_area_sqm)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(facets = vars(flat_type), scales = "free_y")
# Violin plot of floor area for each flat type.
ggplot(data = sales, mapping = aes(x = flat_type, y = floor_area_sqm)) +
  geom_violin()