Question

Shapiro-Wilk test on groups within a column

I would like to perform a Shapiro-Wilk test on my data, but on groups within a column.

I am working with the following dataset:

structure(list(Year = c(2000L, 2001L, 2001L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2004L, 2004L, 2004L, 2004L, 
2004L, 2004L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2006L, 2006L, 
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 
2007L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 
2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2009L, 
2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 
2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2021L, 2021L, 2021L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 
2021L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 2022L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 2023L, 
2023L, 2023L, 2023L, NA, NA, NA), Weight = c("492", "536", "564", 
"630", "462", "466", "552", "490", "606", "482", "478", "604", 
"616", "558", "576", "596", "500", "542", "600", "550", "586", 
"584", "596", "568", "612", "464", "596", "475", "636", "614", 
"624", "636", "554", "560", "578", "504", "668", "570", "504", 
"534", "553", "558", "530", "690", "456", "522", "604", "500", 
"612", "538", "512", "498", "534", "404", "502", "580", "538", 
"436", "620", "580", "588", "600", "620", "434", "568", "436", 
"460", "462", "520", "472", "546", "498", "586", "640", "474", 
"582", "688", "400", "530", "534", "540", "552", "488", "510", 
"382", "498", "594", "546", "610", "542", "644", "644", "545", 
"580", "464", "562", "472", "522", "558", "556", "630", "556", 
"554", "550", "550", "585", "512", "598", "420", "655", "584", 
"598", "570", "558", "390", "612", "628", "554", "580", "500", 
"588", "630", "580", "468", "432", "540", "514", "536", "500", 
"630", "574", "578", "482", "558", "468", "532", "628", "422", 
"624", "556", "526", "522", "560", "540", "636", "700", "508", 
"454", "672", "422", "566", "550", "625", "515", "650", "650", 
"650", "658", "482", "688", "695", "560", "600", "626", "510", 
"507", "627", "670", "590", "513", "622", "693", "476", "648", 
"600", "585", "534", "474", "526", "525", "603", "550", "520", 
"554", "569", "508", "533", "594", "510", "469", "499", "560", 
"560", "436", "555", "560", "396", "470", "456", "586", "536", 
"553", "700", "479", "553", "659", "566", "427", "583", "466", 
"530", "552", "601", "593", "515", "523", "585", "538", "524", 
"646", "681", "595", "405", "601", "426", "473", "438", "541", 
"568", "533", "480", "434", "596", "508", "606", "480", "523", 
"472", "521", "480", "393", "645", "694", "715", "653", "635", 
"638", "592", "599", "574", "505", "590", "471", "538", "533", 
"539", "586", "493", "456", "581", "519", "606", "512", "462", 
"667", "576", "394", "439", "500", "645", "494", "612", "479", 
"613", "507", "443", "630", "688", "613", "583", "548", "588", 
"613", "566", "564", "601", "637", "552", "580", "620", "605", 
"556", "531", "520", "501", "582", "556", "548", "590", "514", 
"587", "593", "649", "482", "488", "541", "538", "533", "590", 
"540", "489", "585", "672", "543", "537", "535", "565", "598", 
"639", "373", "704", "573", "607", "460", "479", "407", "705", 
"572", "534", "517", "496", "476", "640", "#DIV/0!", "2000", 
"0")), row.names = c(5L, 8L, 11L, 12L, 13L, 16L, 17L, 20L, 22L, 
27L, 28L, 30L, 31L, 33L, 34L, 35L, 36L, 37L, 39L, 40L, 42L, 44L, 
46L, 48L, 50L, 52L, 53L, 54L, 56L, 58L, 59L, 61L, 62L, 63L, 65L, 
66L, 67L, 69L, 70L, 72L, 73L, 74L, 75L, 77L, 78L, 79L, 80L, 81L, 
82L, 83L, 84L, 86L, 89L, 90L, 92L, 93L, 94L, 97L, 99L, 102L, 
103L, 104L, 106L, 107L, 109L, 110L, 112L, 113L, 115L, 116L, 117L, 
118L, 119L, 120L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L, 
130L, 131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 
141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 
152L, 153L, 154L, 155L, 157L, 158L, 159L, 160L, 161L, 162L, 163L, 
164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 173L, 174L, 
175L, 176L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 184L, 185L, 
186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L, 194L, 195L, 196L, 
197L, 198L, 199L, 200L, 201L, 202L, 203L, 204L, 205L, 206L, 207L, 
208L, 210L, 211L, 212L, 213L, 214L, 215L, 216L, 217L, 218L, 219L, 
220L, 221L, 222L, 223L, 224L, 225L, 228L, 229L, 230L, 231L, 234L, 
237L, 238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 246L, 247L, 
248L, 249L, 250L, 251L, 252L, 253L, 254L, 255L, 256L, 257L, 258L, 
259L, 260L, 262L, 263L, 264L, 265L, 266L, 267L, 269L, 270L, 271L, 
272L, 273L, 274L, 275L, 276L, 277L, 278L, 279L, 280L, 281L, 282L, 
283L, 284L, 285L, 286L, 287L, 288L, 289L, 290L, 291L, 292L, 293L, 
294L, 295L, 296L, 297L, 298L, 299L, 300L, 301L, 303L, 304L, 305L, 
306L, 307L, 308L, 309L, 310L, 311L, 312L, 313L, 314L, 315L, 316L, 
317L, 318L, 320L, 321L, 322L, 323L, 324L, 325L, 326L, 327L, 328L, 
329L, 330L, 331L, 332L, 333L, 334L, 337L, 338L, 339L, 340L, 341L, 
342L, 343L, 344L, 345L, 346L, 347L, 348L, 349L, 350L, 351L, 352L, 
353L, 354L, 355L, 356L, 357L, 358L, 359L, 360L, 361L, 362L, 363L, 
364L, 365L, 366L, 367L, 368L, 369L, 370L, 371L, 372L, 373L, 374L, 
375L, 376L, 377L, 378L, 380L, 381L, 382L, 383L, 386L, 387L, 388L, 
389L, 390L, 391L, 392L, 393L, 394L, 395L, 396L, 397L, 591L, 628L, 
629L), class = "data.frame")

I would like to perform a Shapiro-Wilk test on Weight grouped by Year. I can't find the code for this, though. I have tried the following, but it doesn't work:

df3 %>% group_by(Year) %>%
  shapiro.test(Weight)

This returns the error:

Error in shapiro.test(., Weight) : unused argument (Weight)

Solution

 4

First, clean the data:

> df3a <- df3 |> 
+   subset(!is.na(Year)) |> 
+   type.convert(as.is=TRUE)

Then aggregate, and include a tryCatch() for cases where the sample size for a year is below 3.

> aggregate(Weight ~ Year, df3a, \(x) {
+    tryCatch(
+    unlist(shapiro.test(x)[c('statistic', 'p.value')]), 
+      error=\(e) array(, 2, list(c('W', 'p.value')))
+    )
+  })
   Year   Weight.W Weight.p.value
1  2000         NA             NA
2  2001         NA             NA
3  2002 0.88202907     0.27850516
4  2003 0.90747164     0.22757363
5  2004 0.75446004     0.02201890
6  2005 0.97029203     0.88053925
7  2006         NA             NA
8  2007 0.90706715     0.26144681
9  2008 0.90544953     0.09830065
10 2009 0.97429258     0.85779500
11 2018 0.95501346     0.09098129
12 2019 0.97641193     0.82170672
13 2020 0.93880732     0.09331150
14 2021 0.97733646     0.45938118
15 2022 0.97761009     0.54079708
16 2023 0.98273943     0.58904451
2024-07-12
jay.sf

Solution

 3

You can do this with tapply:

tapply(df3$Weight, df3$Year, shapiro.test)

However, there are some things in your data that prevent this from working, which you need to fix first:

  • You have "#DIV/0!" somewhere in your Weight column, which makes it a character vector rather than a numeric one. You need to either explicitly remove those values, or just use as.numeric() and they will become NAs.
  • You have fewer than three weights for some years, and the Shapiro-Wilk test can't work with so few values. You need to filter out these years before testing (see the sketch below).
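
Putting those two fixes together, a minimal sketch of the tapply() approach could look like the following (assuming, as in the question, that the data frame is called df3; the names ok, df3_clean, counts and keep are just illustrative):

# Coerce Weight to numeric; "#DIV/0!" becomes NA (with a warning)
df3$Weight <- as.numeric(df3$Weight)

# Keep rows with a known Year and a usable Weight
ok <- !is.na(df3$Year) & !is.na(df3$Weight)
df3_clean <- df3[ok, ]

# Drop years with fewer than 3 observations, then test each remaining year
counts <- table(df3_clean$Year)
keep <- df3_clean$Year %in% names(counts)[counts >= 3]
tapply(df3_clean$Weight[keep], df3_clean$Year[keep], shapiro.test)

This returns a list with one htest object per remaining year, which can be indexed by year to read off the W statistic and p-value.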
2024-07-12
Zé Loff

Solution

 3
df3 |> 
  filter(!is.na(Year), n()>2, .by=Year) |> # Shapiro-Wilk needs at least 3 obs. per year; drop NA years
  mutate(Weight=as.numeric(Weight)) |>
  group_by(Year) |>
  summarise(stat=shapiro.test(Weight)$statistic,
            p.value=shapiro.test(Weight)$p.value)

# A tibble: 13 x 3
    Year  stat p.value
   <int> <dbl>   <dbl>
 1  2002 0.882  0.279 
 2  2003 0.907  0.228 
 3  2004 0.754  0.0220
 4  2005 0.970  0.881 
 5  2007 0.907  0.261 
 6  2008 0.905  0.0983
 7  2009 0.974  0.858 
 8  2018 0.955  0.0910
 9  2019 0.976  0.822 
10  2020 0.939  0.0933
11  2021 0.977  0.459 
12  2022 0.978  0.541 
13  2023 0.983  0.589
2024-07-12
Edward

Solution

 0
library(dplyr)
library(tidyr)

df3 |> 
  # keep only years with at least 3 observations, and drop rows with a missing Year
  filter(n() >= 3, !is.na(Year), .by = Year) |>
  # convert Weight to numeric, then attach the tidied test results to every row of its year
  mutate(Weight = as.numeric(Weight), 
         results = broom::tidy(shapiro.test(Weight)), .by = Year) |>
  # expand the results column into statistic, p.value and method columns
  unnest(results)
2024-07-12
LMc