Question

Order columns based on suffix with dplyr and stringr

I have a dataframe that has columns that are separated out by suffix see ex_df. Each time I run the code there may be varying column amounts based on database entries so I am trying to find a way to organize it more intuitively. This means that when running my code this can actually get to be to _20 instead and would rather not have to edit my code each time.

#Df on how it currently is

ex_df <- tibble("address" = c("123 Str", "456 str"),
                "activity_1" = c("aa","bb"),
                "activity_2" = c("cc","dd"),
                "type_1" = c("11","22"),
                "type_2" = c("33","44")
                )

#how i want the df to look like
new_df <- tibble("address" = c("123 Str", "456 str"),
                "type_1" = c("11","22"),
                "activity_1" = c("aa","bb"),
                "type_2" = c("33","44"),
                "activity_2" = c("cc","dd")
                )

If it was always only two columns I would just do a select() or reorder(). And I do not want to use select(order(colnames(ex_df))) because then activity would come before address and the issue would still not be solved.

3 77 3

1 Jan 1970

Solution

You can pivot back and forth and take advantage of the names_vary argument.

pivot_longer(ex_df, -address, names_to=c(".value", "name"), names_sep="_") |>
  mutate(name=as.integer(name)) |>
  arrange(name) |>
  pivot_wider(values_from=c("activity","type"), names_vary = "slowest")

# A tibble: 2 × 5
  address activity_1 type_1 activity_2 type_2
  <chr>   <chr>      <chr>  <chr>      <chr> 
1 123 Str aa         11     cc         33    
2 456 str bb         22     dd         44

This will now work for any number of suffixes. (Thanks s_baldur)

ex_df <- tibble::tibble(
  activity_1 = c("aa", "bb"),
  address = c("123 Str", "456 str"),
  type_30 = c("00", "99"),
  activity_2 = c("cc", "dd"),
  activity_30 = c("hh", "jj"),
  type_1 = c("11", "22"),
  type_2 = c("33", "44"),
)

2024-07-18

Edward

Solution

helper <- function(x) {
  if (length(x) < 2) return(0)
  switch(x[1], "type" = 0.5, "activity" = 1) + as.numeric(x[2])
}

ex_df[order(sapply(strsplit(names(ex_df), "_"), helper))]

#   address type_1 activity_1 type_2 activity_2
#   <chr>   <chr>  <chr>      <chr>  <chr>     
# 1 123 Str 11     aa         33     cc        
# 2 456 str 22     bb         44     dd

Another test dataset

ex_df <- tibble::tibble(
  activity_1 = c("aa", "bb"),
  address = c("123 Str", "456 str"),
  type_30 = c("00", "99"),
  activity_2 = c("cc", "dd"),
  activity_30 = c("hh", "jj"),
  type_1 = c("11", "22"),
  type_2 = c("33", "44"),
)

ex_df[order(sapply(strsplit(names(ex_df), "_"), helper))]
  address type_1 activity_1 type_2 activity_2 type_30 activity_30
  <chr>   <chr>  <chr>      <chr>  <chr>      <chr>   <chr>      
1 123 Str 11     aa         33     cc         00      hh         
2 456 str 22     bb         44     dd         99      jj

2024-07-18

s_baldur

Solution

This will put all columns that do not end with a digit (e.g., address) at the front of the data frame:

library(dplyr)
library(stringr)

ex_df |>
  select(str_order(str_extract(names(ex_df), "\\d+$"), numeric = T, na_last = F))
#  address activity_1 type_1 activity_2 type_2
#   <chr>   <chr>      <chr>  <chr>      <chr> 
# 1 123 Str aa         11     cc         33    
# 2 456 str bb         22     dd         44

2024-07-18

LMc

Solution

In case there's a number at the end of names this gsub takes it to the front, which makes it rank after the zero-attached non-number names.

EDIT, added _ to make it easier for the regex to distinguish between mid-part and ending number

ex_df[, order(gsub("^(.).*_(\\d+)$", "\\2\\1", paste0(0, colnames(ex_df))))]
# A tibble: 2 × 5
  address activity_1 type_1 activity_2 type_2
  <chr>   <chr>      <chr>  <chr>      <chr>
1 123 Str aa         11     cc         33
2 456 str bb         22     dd         44

2024-07-18

Andre Wildberg

Solution

You can use str_detect() to extract the numbers after the underscore and use the order of these to re-order the columns. (Wrapped in a function here for reuseability):

library(tibble)
ex_df <- tibble("address" = c("123 Str", "456 str"),
                "activity_1" = c("aa","bb"),
                "activity_2" = c("cc","dd"),
                "type_1" = c("11","22"),
                "type_2" = c("33","44")
)

col_order <- function(df) {
  unique(
    c(
      # first get the variables not ending in a number - these will come first
      which(stringr::str_detect(names(df), "_\\d+$", negate = TRUE)),   
      # now the numbered variables (as.numeric to ensure 9 comes before 10)
      order(as.numeric(stringr::str_extract(names(df), "_(\\d+)$", group = 1))) 
    )
  )
}

ex_df[, col_order(ex_df)]
#> # A tibble: 2 × 5
#>   address activity_1 type_1 activity_2 type_2
#>   <chr>   <chr>      <chr>  <chr>      <chr> 
#> 1 123 Str aa         11     cc         33    
#> 2 456 str bb         22     dd         44

^{Created on 2024-07-18 with reprex v2.1.0}

2024-07-18

shaun_m