DataScience Workbook / 08: Data Visualization / 2. Introduction to scientific graphing / 2.3. RStudio – data processing & plotting with R
Introduction
Predicting the age of abalone from physical measurements.
Dataset characteristics: multivariate
Attribute type: categorical, integer, real
Subject area: life
Instances: 4177
Associated Tasks: classification
Attributes: 8
# Load the raw abalone data. The file has no header row, so the column names
# are supplied explicitly (in file order) instead of letting readr treat the
# first record as a header.
abalone <- readr::read_csv(
  "abalone.data",
  col_names = c(
    "Sex", "Length", "Diameter", "Height", "Whole_weight",
    "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"
  )
)
# Print the tibble to preview the first rows and the column types
abalone
## # A tibble: 4,177 × 9
## Sex Length Diameter Height Whole_weight Shucked_weight Viscera_weight Shell_weight Rings
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M 0.455 0.365 0.095 0.514 0.224 0.101 0.15 15
## 2 M 0.35 0.265 0.09 0.226 0.0995 0.0485 0.07 7
## 3 F 0.53 0.42 0.135 0.677 0.256 0.142 0.21 9
## 4 M 0.44 0.365 0.125 0.516 0.216 0.114 0.155 10
## 5 I 0.33 0.255 0.08 0.205 0.0895 0.0395 0.055 7
## 6 I 0.425 0.3 0.095 0.352 0.141 0.0775 0.12 8
## 7 F 0.53 0.415 0.15 0.778 0.237 0.142 0.33 20
## 8 F 0.545 0.425 0.125 0.768 0.294 0.150 0.26 16
## 9 M 0.475 0.37 0.125 0.509 0.216 0.112 0.165 9
## 10 F 0.55 0.44 0.15 0.894 0.314 0.151 0.32 19
## # ℹ 4,167 more rows
# Transposed overview: one line per column with its type and first values
abalone %>% glimpse()
## Rows: 4,177
## Columns: 9
## $ Sex <chr> "M", "M", "F", "M", "I", "I", "F", "F", "M", "F", "F", "M", "M", "F", "F", "M", "I", "F", "M", "M", "M", "I"…
## $ Length <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0.475, 0.550, 0.525, 0.430, 0.490, 0.535, 0.470, 0.5…
## $ Diameter <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0.370, 0.440, 0.380, 0.350, 0.380, 0.405, 0.355, 0.4…
## $ Height <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0.125, 0.150, 0.140, 0.110, 0.135, 0.145, 0.100, 0.1…
## $ Whole_weight <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0.7680, 0.5095, 0.8945, 0.6065, 0.4060, 0.5415, 0.68…
## $ Shucked_weight <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0.2940, 0.2165, 0.3145, 0.1940, 0.1675, 0.2175, 0.27…
## $ Viscera_weight <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0.1495, 0.1125, 0.1510, 0.1475, 0.0810, 0.0950, 0.17…
## $ Shell_weight <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0.165, 0.320, 0.210, 0.135, 0.190, 0.205, 0.185, 0.2…
## $ Rings <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12, 7, 10, 7, 9, 11, 10, 12, 9, 10, 11, 11, 12, 15, 1…
# Descriptive statistics (psych::describe) for every numeric column,
# rendered as a striped, hover-highlighted HTML table.
kableExtra::kable_styling(
  kableExtra::kbl(
    psych::describe(select(abalone, where(is.numeric)))
  ),
  bootstrap_options = c("striped", "hover")
)
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Length | 1 | 4177 | 0.5239921 | 0.1200929 | 0.5450 | 0.5324783 | 0.1186080 | 0.0750 | 0.8150 | 0.7400 | -0.6394138 | 0.0616411 | 0.0018582 |
Diameter | 2 | 4177 | 0.4078813 | 0.0992399 | 0.4250 | 0.4146994 | 0.0963690 | 0.0550 | 0.6500 | 0.5950 | -0.6087607 | -0.0482711 | 0.0015355 |
Height | 3 | 4177 | 0.1395164 | 0.0418271 | 0.1400 | 0.1402498 | 0.0370650 | 0.0000 | 1.1300 | 1.1300 | 3.1265706 | 75.8953091 | 0.0006472 |
Whole_weight | 4 | 4177 | 0.8287422 | 0.4903890 | 0.7995 | 0.7995646 | 0.5285469 | 0.0020 | 2.8255 | 2.8235 | 0.5305773 | -0.0264756 | 0.0075877 |
Shucked_weight | 5 | 4177 | 0.3593675 | 0.2219629 | 0.3360 | 0.3439231 | 0.2349921 | 0.0010 | 1.4880 | 1.4870 | 0.7185815 | 0.5912553 | 0.0034344 |
Viscera_weight | 6 | 4177 | 0.1805936 | 0.1096143 | 0.1710 | 0.1733193 | 0.1178667 | 0.0005 | 0.7600 | 0.7595 | 0.5914271 | 0.0809994 | 0.0016960 |
Shell_weight | 7 | 4177 | 0.2388309 | 0.1392027 | 0.2340 | 0.2305173 | 0.1475187 | 0.0015 | 1.0050 | 1.0035 | 0.6204809 | 0.5281636 | 0.0021538 |
Rings | 8 | 4177 | 9.9336845 | 3.2241690 | 9.0000 | 9.6410410 | 2.9652000 | 1.0000 | 29.0000 | 28.0000 | 1.1133019 | 2.3239123 | 0.0498868 |
# Scatter plot of Height against Diameter, coloured by Sex, built with ggplot2
# and converted straight into an interactive plotly widget.
p <- ggplotly(
  ggplot(abalone, aes(x = Diameter, y = Height, color = Sex)) +
    geom_point() +
    theme_minimal()
)
# Display the interactive plot
p
# Three views of the distribution of Rings, each built with ggplot2 and
# converted to an interactive plotly widget in the same step.

# p1: histogram using ggplot2's default binning
p1 <- ggplotly(
  ggplot(abalone, aes(x = Rings)) +
    geom_histogram(color = "black", fill = "steelblue")
)

# p2: histogram with bins of width 2 aligned at 0 and x-axis ticks every 4 units
p2 <- ggplotly(
  ggplot(abalone, aes(x = Rings)) +
    geom_histogram(binwidth = 2, boundary = 0, color = "black", fill = "steelblue") +
    scale_x_continuous(breaks = seq(0, 30, 4), limits = c(0, 30))
)

# p3: boxplot of Rings on the same 0-30 axis as the custom histogram
p3 <- ggplotly(
  ggplot(abalone, aes(y = Rings)) +
    geom_boxplot(fill = "steelblue") +
    scale_y_continuous(breaks = seq(0, 30, 4), limits = c(0, 30))
)

# Arrange the three widgets in a two-row grid with plotly's `subplot` and show it
p4 <- subplot(p1, p2, p3, nrows = 2)
p4
# Compute the Spearman rank-correlation matrix of the numeric columns
corr_matrix <- cor(abalone %>% select(where(is.numeric)), method = "spearman")

# Convert the correlation matrix into a tidy (long) format:
# one row per (Var1, Var2) pair with its correlation coefficient
corr_df <- as.data.frame(corr_matrix) %>%
  rownames_to_column("Var1") %>%
  reshape2::melt(id.vars = "Var1", variable.name = "Var2", value.name = "Correlation")

# melt() returns Var2 as a factor in matrix column order, while Var1 is still a
# character vector that ggplot2 would sort alphabetically. Give Var1 the same
# level order so both axes list the variables identically and the diagonal of
# the correlation matrix appears as a diagonal in the heatmap.
corr_df$Var1 <- factor(corr_df$Var1, levels = rownames(corr_matrix))

# Create a heatmap with ggplot2: one tile per variable pair, labelled with the
# coefficient rounded to two decimals, on a diverging scale centred at 0
heatmap <- ggplot(corr_df, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = round(Correlation, 2)), size = 4) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  theme_minimal() +
  # Tilt the x-axis labels so the long variable names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the heatmap to a plotly interactive graphic
heatmap <- ggplotly(heatmap)
heatmap
The attributes most strongly correlated with Rings are Shell_weight and Height.