admin管理员组

文章数量:1022249

This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data

id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df

  id state period
1  1      C  Start
2  1      A    End
3  2      A  Start
4  2      A    End
5  3      B  Start
6  3      A    End
7  4      C  Start
8  4      A    End

and the expected output

    transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to B    0
5     B to A    1
6     B to C    0
7     C to C    0
8     C to A    2
9     C to B    0

I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!

This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data

id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df

  id state period
1  1      C  Start
2  1      A    End
3  2      A  Start
4  2      A    End
5  3      B  Start
6  3      A    End
7  4      C  Start
8  4      A    End

and the expected output

    transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to B    0
5     B to A    1
6     B to C    0
7     C to C    0
8     C to A    2
9     C to B    0

I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!

Share Improve this question edited Nov 19, 2024 at 13:27 T Richard asked Nov 19, 2024 at 13:08 T RichardT Richard 6013 silver badges12 bronze badges 7
  • Is it always 2 rows per ID? – zx8754 Commented Nov 19, 2024 at 13:14
  • 1 @zx8754 it's possible to have many rows per ID, but this particular case its 2 rows per ID. Thanks! – T Richard Commented Nov 19, 2024 at 13:17
  • "start A end B" same as "start B end A" ? – zx8754 Commented Nov 19, 2024 at 13:24
  • @zx8754 Not same. – T Richard Commented Nov 19, 2024 at 13:25
  • If there are many rows per ID, how do you know which Start matches with which End? – Allan Cameron Commented Nov 19, 2024 at 13:29
 |  Show 2 more comments

6 Answers 6

Reset to default 6

A solution entirely within base R could be something like:

do.call("c",
  split(df$state, df$id) |>
  lapply(paste, collapse = " to ")) |>
  factor(levels = sort(c(outer(unique(df$state), unique(df$state), 
                               FUN = paste, sep = " to ")))) |>
  table() |>
  as.data.frame() |>
  setNames(c("transition", "freq"))
#>   transition freq
#> 1     A to A    1
#> 2     A to B    0
#> 3     A to C    0
#> 4     B to A    1
#> 5     B to B    0
#> 6     B to C    0
#> 7     C to A    2
#> 8     C to B    0
#> 9     C to C    0

You can try

states <- sort(unique(df$state))
transform(
    aggregate(
        id ~ .,
        merge(
            expand.grid(state.Start = states, state.End = states),
            reshape(
                df,
                direction = "wide",
                idvar = "id",
                timevar = "period"
            ),
            all = TRUE
        ),
        list,
        na.action = na.pass
    ),
    freq = lengths(id) * !is.na(id)
)

gives

  state.Start state.End   id freq
1           A         A    2    1
2           B         A    3    1
3           C         A 1, 4    2
4           A         B   NA    0
5           B         B   NA    0
6           C         B   NA    0
7           A         C   NA    0
8           B         C   NA    0
9           C         C   NA    0

You could merge permutations with repetition to the first and last element (I interpreted that from your comment) of the state variable pasted together.

> data.frame(
+   Var1=RcppAlgos::permuteGeneral(sort(unique(df$state)), 2, rep=TRUE, 
+                                  FUN=paste, collapse=' to ') |> do.call(what='rbind')
+ ) |> 
+   merge(
+     split(df$state, df$id) |> lapply(\(x) toString(x[c(1, length(x))])) |>
+       sub(', ', ' to ', x=_) |> table() |> as.data.frame(),
+     all=TRUE
+   ) |> 
+   transform(Freq=replace(Freq, is.na(Freq), 0)) |>  setNames(c('transition', 'freq'))
  transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to A    1
5     B to B    0
6     B to C    0
7     C to A    2
8     C to B    0
9     C to C    0

With tidyverse, using left_join on the crossing states

library(dplyr)
library(tidyr)

left_join(crossing(Start = df$state, End = df$state), 
          pivot_wider(df, names_from = period, values_from = state)) %>% 
  reframe(freq = id * 0 + n(), .by = c(Start, End)) %>% 
  distinct()

output

# A tibble: 9 × 3
  Start End    freq
  <chr> <chr> <dbl>
1 A     A         1
2 A     B        NA
3 A     C        NA
4 B     A         1
5 B     B        NA
6 B     C        NA
7 C     A         2
8 C     B        NA
9 C     C        NA

or, to get the complete desired format

left_join(crossing(Start = df$state, End = df$state), 
          pivot_wider(df, names_from = period, values_from = state)) %>% 
  mutate(freq = id * 0 + n(),
         freq = replace(freq, is.na(freq), 0), .by = c(Start, End)) %>% 
  mutate(transition = purrr::map2_vec(Start, End, ~ paste(.x, "to", .y)), .before=1) %>%
  select(-c(Start, End, id)) %>% 
  distinct()
Joining with `by = join_by(Start, End)`
# A tibble: 9 × 2
  transition  freq
  <chr>      <dbl>
1 A to A         1
2 A to B         0
3 A to C         0
4 B to A         1
5 B to B         0
6 B to C         0
7 C to A         2
8 C to B         0
9 C to C         0

An alternative format of the data is to store in a length(id) x length(period) matrix:

rows = unique(df$id)
cols = unique(df$period)
mat = matrix(nrow = length(rows), ncol = length(cols), 
             dimnames = list(id = rows, period = cols))
mat[cbind(df$id, df$period)] = df$state
#mat
#   period
#id  Start End
#  1 "C"   "A"
#  2 "A"   "A"
#  3 "B"   "A"
#  4 "C"   "A"

And then, tabulate:

lvls = unique(df$state)
as.data.frame(table(factor(mat[, "Start"], lvls), 
                    factor(mat[, "End"], lvls)))
#  Var1 Var2 Freq
#1    C    C    0
#2    A    C    0
#3    B    C    0
#4    C    A    2
#5    A    A    1
#6    B    A    1
#7    C    B    0
#8    A    B    0
#9    B    B    0

and format as needed afterwards.

a (detailled) solution with data.table :)

Code:

library(data.table)
library(stringr)

# define the example
dt_example <- data.table(
  id = c(1,1,2,2,3,3,4,4),
  state = c("C","A", "A", "A", "B", "A", "C", "A"),
  period = rep(c("Start", "End"), 4)
)

# define the target output structure
dt_target <- data.table( 
  Start.Point = c( rep("A",3), rep("B",3),rep("C",3) ), 
  End.Point = rep(c("A", "B", "C"),3) 
)
dt_target[, Path := str_c(Start.Point, " -> ", End.Point)]

# work the data
dt_agregate <- merge( 
  dt_example[period == "Start", .(id, Start.Point = state)], 
  dt_example[period == "End", .(id, End.Point = state)], 
  by = "id", 
  allow.cartesian = T
)
dt_agregate[, Path := str_c(Start.Point, " -> ", End.Point)]

# attach the aggreagation results
dt_target <- merge(
   dt_target[,.(Path)],
   dt_agregate[, .(freq = .N), .(Path)],
   by = "Path", 
   all = T
 )

# replace NA by 0
setnafill(dt_target, fill = 0, cols = 2)
dt_target

Output:

Key: <Path>
     Path  freq
   <char> <int>
1: A -> A     1
2: A -> B     0
3: A -> C     0
4: B -> A     1
5: B -> B     0
6: B -> C     0
7: C -> A     2
8: C -> B     0
9: C -> C     0

This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data

id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df

  id state period
1  1      C  Start
2  1      A    End
3  2      A  Start
4  2      A    End
5  3      B  Start
6  3      A    End
7  4      C  Start
8  4      A    End

and the expected output

    transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to B    0
5     B to A    1
6     B to C    0
7     C to C    0
8     C to A    2
9     C to B    0

I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!

This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data

id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df

  id state period
1  1      C  Start
2  1      A    End
3  2      A  Start
4  2      A    End
5  3      B  Start
6  3      A    End
7  4      C  Start
8  4      A    End

and the expected output

    transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to B    0
5     B to A    1
6     B to C    0
7     C to C    0
8     C to A    2
9     C to B    0

I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!

Share Improve this question edited Nov 19, 2024 at 13:27 T Richard asked Nov 19, 2024 at 13:08 T RichardT Richard 6013 silver badges12 bronze badges 7
  • Is it always 2 rows per ID? – zx8754 Commented Nov 19, 2024 at 13:14
  • 1 @zx8754 it's possible to have many rows per ID, but this particular case its 2 rows per ID. Thanks! – T Richard Commented Nov 19, 2024 at 13:17
  • "start A end B" same as "start B end A" ? – zx8754 Commented Nov 19, 2024 at 13:24
  • @zx8754 Not same. – T Richard Commented Nov 19, 2024 at 13:25
  • If there are many rows per ID, how do you know which Start matches with which End? – Allan Cameron Commented Nov 19, 2024 at 13:29
 |  Show 2 more comments

6 Answers 6

Reset to default 6

A solution entirely within base R could be something like:

do.call("c",
  split(df$state, df$id) |>
  lapply(paste, collapse = " to ")) |>
  factor(levels = sort(c(outer(unique(df$state), unique(df$state), 
                               FUN = paste, sep = " to ")))) |>
  table() |>
  as.data.frame() |>
  setNames(c("transition", "freq"))
#>   transition freq
#> 1     A to A    1
#> 2     A to B    0
#> 3     A to C    0
#> 4     B to A    1
#> 5     B to B    0
#> 6     B to C    0
#> 7     C to A    2
#> 8     C to B    0
#> 9     C to C    0

You can try

states <- sort(unique(df$state))
transform(
    aggregate(
        id ~ .,
        merge(
            expand.grid(state.Start = states, state.End = states),
            reshape(
                df,
                direction = "wide",
                idvar = "id",
                timevar = "period"
            ),
            all = TRUE
        ),
        list,
        na.action = na.pass
    ),
    freq = lengths(id) * !is.na(id)
)

gives

  state.Start state.End   id freq
1           A         A    2    1
2           B         A    3    1
3           C         A 1, 4    2
4           A         B   NA    0
5           B         B   NA    0
6           C         B   NA    0
7           A         C   NA    0
8           B         C   NA    0
9           C         C   NA    0

You could merge permutations with repetition to the first and last element (I interpreted that from your comment) of the state variable pasted together.

> data.frame(
+   Var1=RcppAlgos::permuteGeneral(sort(unique(df$state)), 2, rep=TRUE, 
+                                  FUN=paste, collapse=' to ') |> do.call(what='rbind')
+ ) |> 
+   merge(
+     split(df$state, df$id) |> lapply(\(x) toString(x[c(1, length(x))])) |>
+       sub(', ', ' to ', x=_) |> table() |> as.data.frame(),
+     all=TRUE
+   ) |> 
+   transform(Freq=replace(Freq, is.na(Freq), 0)) |>  setNames(c('transition', 'freq'))
  transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to A    1
5     B to B    0
6     B to C    0
7     C to A    2
8     C to B    0
9     C to C    0

With tidyverse, using left_join on the crossing states

library(dplyr)
library(tidyr)

left_join(crossing(Start = df$state, End = df$state), 
          pivot_wider(df, names_from = period, values_from = state)) %>% 
  reframe(freq = id * 0 + n(), .by = c(Start, End)) %>% 
  distinct()

output

# A tibble: 9 × 3
  Start End    freq
  <chr> <chr> <dbl>
1 A     A         1
2 A     B        NA
3 A     C        NA
4 B     A         1
5 B     B        NA
6 B     C        NA
7 C     A         2
8 C     B        NA
9 C     C        NA

or, to get the complete desired format

left_join(crossing(Start = df$state, End = df$state), 
          pivot_wider(df, names_from = period, values_from = state)) %>% 
  mutate(freq = id * 0 + n(),
         freq = replace(freq, is.na(freq), 0), .by = c(Start, End)) %>% 
  mutate(transition = purrr::map2_vec(Start, End, ~ paste(.x, "to", .y)), .before=1) %>%
  select(-c(Start, End, id)) %>% 
  distinct()
Joining with `by = join_by(Start, End)`
# A tibble: 9 × 2
  transition  freq
  <chr>      <dbl>
1 A to A         1
2 A to B         0
3 A to C         0
4 B to A         1
5 B to B         0
6 B to C         0
7 C to A         2
8 C to B         0
9 C to C         0

An alternative format of the data is to store in a length(id) x length(period) matrix:

rows = unique(df$id)
cols = unique(df$period)
mat = matrix(nrow = length(rows), ncol = length(cols), 
             dimnames = list(id = rows, period = cols))
mat[cbind(df$id, df$period)] = df$state
#mat
#   period
#id  Start End
#  1 "C"   "A"
#  2 "A"   "A"
#  3 "B"   "A"
#  4 "C"   "A"

And then, tabulate:

lvls = unique(df$state)
as.data.frame(table(factor(mat[, "Start"], lvls), 
                    factor(mat[, "End"], lvls)))
#  Var1 Var2 Freq
#1    C    C    0
#2    A    C    0
#3    B    C    0
#4    C    A    2
#5    A    A    1
#6    B    A    1
#7    C    B    0
#8    A    B    0
#9    B    B    0

and format as needed afterwards.

a (detailled) solution with data.table :)

Code:

library(data.table)
library(stringr)

# define the example
dt_example <- data.table(
  id = c(1,1,2,2,3,3,4,4),
  state = c("C","A", "A", "A", "B", "A", "C", "A"),
  period = rep(c("Start", "End"), 4)
)

# define the target output structure
dt_target <- data.table( 
  Start.Point = c( rep("A",3), rep("B",3),rep("C",3) ), 
  End.Point = rep(c("A", "B", "C"),3) 
)
dt_target[, Path := str_c(Start.Point, " -> ", End.Point)]

# work the data
dt_agregate <- merge( 
  dt_example[period == "Start", .(id, Start.Point = state)], 
  dt_example[period == "End", .(id, End.Point = state)], 
  by = "id", 
  allow.cartesian = T
)
dt_agregate[, Path := str_c(Start.Point, " -> ", End.Point)]

# attach the aggreagation results
dt_target <- merge(
   dt_target[,.(Path)],
   dt_agregate[, .(freq = .N), .(Path)],
   by = "Path", 
   all = T
 )

# replace NA by 0
setnafill(dt_target, fill = 0, cols = 2)
dt_target

Output:

Key: <Path>
     Path  freq
   <char> <int>
1: A -> A     1
2: A -> B     0
3: A -> C     0
4: B -> A     1
5: B -> B     0
6: B -> C     0
7: C -> A     2
8: C -> B     0
9: C -> C     0

本文标签: dataframeTransition tables from longitudinal data in the long format using RStack Overflow