DataScience Workbook / 08: Data Visualization / 2. Introduction to scientific graphing / 2.3. RStudio – data processing & plotting with R

Introduction

Predicting the age of abalone from physical measurements.
Dataset characteristics: multivariate
Attribute type: categorical, integer, real
Subject area: life
Instances: 4177
Associated Tasks: classification
Attributes: 8

abalone <- readr::read_csv( "abalone.data", col_names = F )
names( abalone ) <- c( "Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings" )
abalone
## # A tibble: 4,177 × 9
##    Sex   Length Diameter Height Whole_weight Shucked_weight Viscera_weight Shell_weight Rings
##    <chr>  <dbl>    <dbl>  <dbl>        <dbl>          <dbl>          <dbl>        <dbl> <dbl>
##  1 M      0.455    0.365  0.095        0.514         0.224          0.101         0.15     15
##  2 M      0.35     0.265  0.09         0.226         0.0995         0.0485        0.07      7
##  3 F      0.53     0.42   0.135        0.677         0.256          0.142         0.21      9
##  4 M      0.44     0.365  0.125        0.516         0.216          0.114         0.155    10
##  5 I      0.33     0.255  0.08         0.205         0.0895         0.0395        0.055     7
##  6 I      0.425    0.3    0.095        0.352         0.141          0.0775        0.12      8
##  7 F      0.53     0.415  0.15         0.778         0.237          0.142         0.33     20
##  8 F      0.545    0.425  0.125        0.768         0.294          0.150         0.26     16
##  9 M      0.475    0.37   0.125        0.509         0.216          0.112         0.165     9
## 10 F      0.55     0.44   0.15         0.894         0.314          0.151         0.32     19
## # ℹ 4,167 more rows
glimpse( abalone )
## Rows: 4,177
## Columns: 9
## $ Sex            <chr> "M", "M", "F", "M", "I", "I", "F", "F", "M", "F", "F", "M", "M", "F", "F", "M", "I", "F", "M", "M", "M", "I"…
## $ Length         <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0.475, 0.550, 0.525, 0.430, 0.490, 0.535, 0.470, 0.5…
## $ Diameter       <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0.370, 0.440, 0.380, 0.350, 0.380, 0.405, 0.355, 0.4…
## $ Height         <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0.125, 0.150, 0.140, 0.110, 0.135, 0.145, 0.100, 0.1…
## $ Whole_weight   <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0.7680, 0.5095, 0.8945, 0.6065, 0.4060, 0.5415, 0.68…
## $ Shucked_weight <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0.2940, 0.2165, 0.3145, 0.1940, 0.1675, 0.2175, 0.27…
## $ Viscera_weight <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0.1495, 0.1125, 0.1510, 0.1475, 0.0810, 0.0950, 0.17…
## $ Shell_weight   <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0.165, 0.320, 0.210, 0.135, 0.190, 0.205, 0.185, 0.2…
## $ Rings          <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12, 7, 10, 7, 9, 11, 10, 12, 9, 10, 11, 11, 12, 15, 1…
abalone %>%
  select( where(is.numeric) ) %>%
  psych::describe() %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling( bootstrap_options = c("striped", "hover") )
vars n mean sd median trimmed mad min max range skew kurtosis se
Length 1 4177 0.5239921 0.1200929 0.5450 0.5324783 0.1186080 0.0750 0.8150 0.7400 -0.6394138 0.0616411 0.0018582
Diameter 2 4177 0.4078813 0.0992399 0.4250 0.4146994 0.0963690 0.0550 0.6500 0.5950 -0.6087607 -0.0482711 0.0015355
Height 3 4177 0.1395164 0.0418271 0.1400 0.1402498 0.0370650 0.0000 1.1300 1.1300 3.1265706 75.8953091 0.0006472
Whole_weight 4 4177 0.8287422 0.4903890 0.7995 0.7995646 0.5285469 0.0020 2.8255 2.8235 0.5305773 -0.0264756 0.0075877
Shucked_weight 5 4177 0.3593675 0.2219629 0.3360 0.3439231 0.2349921 0.0010 1.4880 1.4870 0.7185815 0.5912553 0.0034344
Viscera_weight 6 4177 0.1805936 0.1096143 0.1710 0.1733193 0.1178667 0.0005 0.7600 0.7595 0.5914271 0.0809994 0.0016960
Shell_weight 7 4177 0.2388309 0.1392027 0.2340 0.2305173 0.1475187 0.0015 1.0050 1.0035 0.6204809 0.5281636 0.0021538
Rings 8 4177 9.9336845 3.2241690 9.0000 9.6410410 2.9652000 1.0000 29.0000 28.0000 1.1133019 2.3239123 0.0498868
p <- ggplot(abalone, aes(x=Diameter, y=Height, color=Sex)) + geom_point() + theme_minimal()
p <- ggplotly(p)
p

# Create a histogram with the default bin range
p1 <- abalone %>%
  ggplot(aes(x=Rings)) +
  geom_histogram(color="black", fill="steelblue")

# Convert to plotly
p1 <- ggplotly(p1)

# Create a histogram with a custom bin range
p2 <- abalone %>%
  ggplot(aes(x=Rings)) +
  geom_histogram(binwidth = 2, boundary = 0, color="black", fill="steelblue") +
  scale_x_continuous(breaks = seq(0, 30, 4), limits = c(0, 30))

# Convert to plotly
p2 <- ggplotly(p2)

# Create a boxplot
p3 <- abalone %>%
  ggplot(aes(y=Rings)) +
  geom_boxplot(fill="steelblue") +
  scale_y_continuous(breaks = seq(0, 30, 4), limits = c(0, 30))

# Convert to plotly
p3 <- ggplotly(p3)

# Print the plots using the `subplot` function from plotly
p4 <- subplot( p1, p2, p3, nrows = 2 )
p4

# Compute the correlation matrix
corr_matrix <- cor( abalone %>% select(where(is.numeric)), method = "spearman" )

# Convert the correlation matrix into a tidy format
corr_df <- as.data.frame( corr_matrix ) %>%
  rownames_to_column( "Var1" ) %>%
  reshape2::melt( id.vars = "Var1", variable.name = "Var2", value.name = "Correlation" )

# Create a heatmap with ggplot2
heatmap <- ggplot( corr_df, aes(x = Var1, y = Var2, fill = Correlation) ) +
  geom_tile() +
  geom_text( aes(label = round(Correlation, 2)), size = 4 ) +
  scale_fill_gradient2( low = "blue", high = "red", mid = "white", midpoint = 0 ) +
  theme_minimal() +
  theme( axis.text.x = element_text(angle = 45, hjust = 1) )

# Convert the heatmap to a plotly interactive graphic
heatmap <- ggplotly(heatmap)
heatmap

The attributes that are most correlated to Rings are Shell_weight and Height.