admin管理员组文章数量:1022249
This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data
id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df
id state period
1 1 C Start
2 1 A End
3 2 A Start
4 2 A End
5 3 B Start
6 3 A End
7 4 C Start
8 4 A End
and the expected output
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to B 0
5 B to A 1
6 B to C 0
7 C to C 0
8 C to A 2
9 C to B 0
I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!
This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data
id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df
id state period
1 1 C Start
2 1 A End
3 2 A Start
4 2 A End
5 3 B Start
6 3 A End
7 4 C Start
8 4 A End
and the expected output
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to B 0
5 B to A 1
6 B to C 0
7 C to C 0
8 C to A 2
9 C to B 0
I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!
Share Improve this question edited Nov 19, 2024 at 13:27 T Richard asked Nov 19, 2024 at 13:08 T RichardT Richard 6013 silver badges12 bronze badges 7- Is it always 2 rows per ID? – zx8754 Commented Nov 19, 2024 at 13:14
- 1 @zx8754 it's possible to have many rows per ID, but this particular case its 2 rows per ID. Thanks! – T Richard Commented Nov 19, 2024 at 13:17
- "start A end B" same as "start B end A" ? – zx8754 Commented Nov 19, 2024 at 13:24
- @zx8754 Not same. – T Richard Commented Nov 19, 2024 at 13:25
- If there are many rows per ID, how do you know which Start matches with which End? – Allan Cameron Commented Nov 19, 2024 at 13:29
6 Answers
Reset to default 6A solution entirely within base R could be something like:
do.call("c",
split(df$state, df$id) |>
lapply(paste, collapse = " to ")) |>
factor(levels = sort(c(outer(unique(df$state), unique(df$state),
FUN = paste, sep = " to ")))) |>
table() |>
as.data.frame() |>
setNames(c("transition", "freq"))
#> transition freq
#> 1 A to A 1
#> 2 A to B 0
#> 3 A to C 0
#> 4 B to A 1
#> 5 B to B 0
#> 6 B to C 0
#> 7 C to A 2
#> 8 C to B 0
#> 9 C to C 0
You can try
states <- sort(unique(df$state))
transform(
aggregate(
id ~ .,
merge(
expand.grid(state.Start = states, state.End = states),
reshape(
df,
direction = "wide",
idvar = "id",
timevar = "period"
),
all = TRUE
),
list,
na.action = na.pass
),
freq = lengths(id) * !is.na(id)
)
gives
state.Start state.End id freq
1 A A 2 1
2 B A 3 1
3 C A 1, 4 2
4 A B NA 0
5 B B NA 0
6 C B NA 0
7 A C NA 0
8 B C NA 0
9 C C NA 0
You could merge
permutations with repetition to the first and last element (I interpreted that from your comment) of the state variable pasted together.
> data.frame(
+ Var1=RcppAlgos::permuteGeneral(sort(unique(df$state)), 2, rep=TRUE,
+ FUN=paste, collapse=' to ') |> do.call(what='rbind')
+ ) |>
+ merge(
+ split(df$state, df$id) |> lapply(\(x) toString(x[c(1, length(x))])) |>
+ sub(', ', ' to ', x=_) |> table() |> as.data.frame(),
+ all=TRUE
+ ) |>
+ transform(Freq=replace(Freq, is.na(Freq), 0)) |> setNames(c('transition', 'freq'))
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to A 1
5 B to B 0
6 B to C 0
7 C to A 2
8 C to B 0
9 C to C 0
With tidyverse
, using left_join
on the crossing
states
library(dplyr)
library(tidyr)
left_join(crossing(Start = df$state, End = df$state),
pivot_wider(df, names_from = period, values_from = state)) %>%
reframe(freq = id * 0 + n(), .by = c(Start, End)) %>%
distinct()
output
# A tibble: 9 × 3
Start End freq
<chr> <chr> <dbl>
1 A A 1
2 A B NA
3 A C NA
4 B A 1
5 B B NA
6 B C NA
7 C A 2
8 C B NA
9 C C NA
or, to get the complete desired format
left_join(crossing(Start = df$state, End = df$state),
pivot_wider(df, names_from = period, values_from = state)) %>%
mutate(freq = id * 0 + n(),
freq = replace(freq, is.na(freq), 0), .by = c(Start, End)) %>%
mutate(transition = purrr::map2_vec(Start, End, ~ paste(.x, "to", .y)), .before=1) %>%
select(-c(Start, End, id)) %>%
distinct()
Joining with `by = join_by(Start, End)`
# A tibble: 9 × 2
transition freq
<chr> <dbl>
1 A to A 1
2 A to B 0
3 A to C 0
4 B to A 1
5 B to B 0
6 B to C 0
7 C to A 2
8 C to B 0
9 C to C 0
An alternative format of the data is to store in a length(id) x length(period)
matrix:
rows = unique(df$id)
cols = unique(df$period)
mat = matrix(nrow = length(rows), ncol = length(cols),
dimnames = list(id = rows, period = cols))
mat[cbind(df$id, df$period)] = df$state
#mat
# period
#id Start End
# 1 "C" "A"
# 2 "A" "A"
# 3 "B" "A"
# 4 "C" "A"
And then, tabulate:
lvls = unique(df$state)
as.data.frame(table(factor(mat[, "Start"], lvls),
factor(mat[, "End"], lvls)))
# Var1 Var2 Freq
#1 C C 0
#2 A C 0
#3 B C 0
#4 C A 2
#5 A A 1
#6 B A 1
#7 C B 0
#8 A B 0
#9 B B 0
and format as needed afterwards.
a (detailled) solution with data.table
:)
Code:
library(data.table)
library(stringr)
# define the example
dt_example <- data.table(
id = c(1,1,2,2,3,3,4,4),
state = c("C","A", "A", "A", "B", "A", "C", "A"),
period = rep(c("Start", "End"), 4)
)
# define the target output structure
dt_target <- data.table(
Start.Point = c( rep("A",3), rep("B",3),rep("C",3) ),
End.Point = rep(c("A", "B", "C"),3)
)
dt_target[, Path := str_c(Start.Point, " -> ", End.Point)]
# work the data
dt_agregate <- merge(
dt_example[period == "Start", .(id, Start.Point = state)],
dt_example[period == "End", .(id, End.Point = state)],
by = "id",
allow.cartesian = T
)
dt_agregate[, Path := str_c(Start.Point, " -> ", End.Point)]
# attach the aggreagation results
dt_target <- merge(
dt_target[,.(Path)],
dt_agregate[, .(freq = .N), .(Path)],
by = "Path",
all = T
)
# replace NA by 0
setnafill(dt_target, fill = 0, cols = 2)
dt_target
Output:
Key: <Path>
Path freq
<char> <int>
1: A -> A 1
2: A -> B 0
3: A -> C 0
4: B -> A 1
5: B -> B 0
6: B -> C 0
7: C -> A 2
8: C -> B 0
9: C -> C 0
This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data
id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df
id state period
1 1 C Start
2 1 A End
3 2 A Start
4 2 A End
5 3 B Start
6 3 A End
7 4 C Start
8 4 A End
and the expected output
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to B 0
5 B to A 1
6 B to C 0
7 C to C 0
8 C to A 2
9 C to B 0
I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!
This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data
id <- c(1,1,2,2,3,3,4,4)
state <- c("C","A", "A", "A", "B", "A", "C", "A")
period <- rep(c("Start", "End"), 4)
df <- data.frame(id, state, period)
df
id state period
1 1 C Start
2 1 A End
3 2 A Start
4 2 A End
5 3 B Start
6 3 A End
7 4 C Start
8 4 A End
and the expected output
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to B 0
5 B to A 1
6 B to C 0
7 C to C 0
8 C to A 2
9 C to B 0
I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!
Share Improve this question edited Nov 19, 2024 at 13:27 T Richard asked Nov 19, 2024 at 13:08 T RichardT Richard 6013 silver badges12 bronze badges 7- Is it always 2 rows per ID? – zx8754 Commented Nov 19, 2024 at 13:14
- 1 @zx8754 it's possible to have many rows per ID, but this particular case its 2 rows per ID. Thanks! – T Richard Commented Nov 19, 2024 at 13:17
- "start A end B" same as "start B end A" ? – zx8754 Commented Nov 19, 2024 at 13:24
- @zx8754 Not same. – T Richard Commented Nov 19, 2024 at 13:25
- If there are many rows per ID, how do you know which Start matches with which End? – Allan Cameron Commented Nov 19, 2024 at 13:29
6 Answers
Reset to default 6A solution entirely within base R could be something like:
do.call("c",
split(df$state, df$id) |>
lapply(paste, collapse = " to ")) |>
factor(levels = sort(c(outer(unique(df$state), unique(df$state),
FUN = paste, sep = " to ")))) |>
table() |>
as.data.frame() |>
setNames(c("transition", "freq"))
#> transition freq
#> 1 A to A 1
#> 2 A to B 0
#> 3 A to C 0
#> 4 B to A 1
#> 5 B to B 0
#> 6 B to C 0
#> 7 C to A 2
#> 8 C to B 0
#> 9 C to C 0
You can try
states <- sort(unique(df$state))
transform(
aggregate(
id ~ .,
merge(
expand.grid(state.Start = states, state.End = states),
reshape(
df,
direction = "wide",
idvar = "id",
timevar = "period"
),
all = TRUE
),
list,
na.action = na.pass
),
freq = lengths(id) * !is.na(id)
)
gives
state.Start state.End id freq
1 A A 2 1
2 B A 3 1
3 C A 1, 4 2
4 A B NA 0
5 B B NA 0
6 C B NA 0
7 A C NA 0
8 B C NA 0
9 C C NA 0
You could merge
permutations with repetition to the first and last element (I interpreted that from your comment) of the state variable pasted together.
> data.frame(
+ Var1=RcppAlgos::permuteGeneral(sort(unique(df$state)), 2, rep=TRUE,
+ FUN=paste, collapse=' to ') |> do.call(what='rbind')
+ ) |>
+ merge(
+ split(df$state, df$id) |> lapply(\(x) toString(x[c(1, length(x))])) |>
+ sub(', ', ' to ', x=_) |> table() |> as.data.frame(),
+ all=TRUE
+ ) |>
+ transform(Freq=replace(Freq, is.na(Freq), 0)) |> setNames(c('transition', 'freq'))
transition freq
1 A to A 1
2 A to B 0
3 A to C 0
4 B to A 1
5 B to B 0
6 B to C 0
7 C to A 2
8 C to B 0
9 C to C 0
With tidyverse
, using left_join
on the crossing
states
library(dplyr)
library(tidyr)
left_join(crossing(Start = df$state, End = df$state),
pivot_wider(df, names_from = period, values_from = state)) %>%
reframe(freq = id * 0 + n(), .by = c(Start, End)) %>%
distinct()
output
# A tibble: 9 × 3
Start End freq
<chr> <chr> <dbl>
1 A A 1
2 A B NA
3 A C NA
4 B A 1
5 B B NA
6 B C NA
7 C A 2
8 C B NA
9 C C NA
or, to get the complete desired format
left_join(crossing(Start = df$state, End = df$state),
pivot_wider(df, names_from = period, values_from = state)) %>%
mutate(freq = id * 0 + n(),
freq = replace(freq, is.na(freq), 0), .by = c(Start, End)) %>%
mutate(transition = purrr::map2_vec(Start, End, ~ paste(.x, "to", .y)), .before=1) %>%
select(-c(Start, End, id)) %>%
distinct()
Joining with `by = join_by(Start, End)`
# A tibble: 9 × 2
transition freq
<chr> <dbl>
1 A to A 1
2 A to B 0
3 A to C 0
4 B to A 1
5 B to B 0
6 B to C 0
7 C to A 2
8 C to B 0
9 C to C 0
An alternative format of the data is to store in a length(id) x length(period)
matrix:
rows = unique(df$id)
cols = unique(df$period)
mat = matrix(nrow = length(rows), ncol = length(cols),
dimnames = list(id = rows, period = cols))
mat[cbind(df$id, df$period)] = df$state
#mat
# period
#id Start End
# 1 "C" "A"
# 2 "A" "A"
# 3 "B" "A"
# 4 "C" "A"
And then, tabulate:
lvls = unique(df$state)
as.data.frame(table(factor(mat[, "Start"], lvls),
factor(mat[, "End"], lvls)))
# Var1 Var2 Freq
#1 C C 0
#2 A C 0
#3 B C 0
#4 C A 2
#5 A A 1
#6 B A 1
#7 C B 0
#8 A B 0
#9 B B 0
and format as needed afterwards.
a (detailled) solution with data.table
:)
Code:
library(data.table)
library(stringr)
# define the example
dt_example <- data.table(
id = c(1,1,2,2,3,3,4,4),
state = c("C","A", "A", "A", "B", "A", "C", "A"),
period = rep(c("Start", "End"), 4)
)
# define the target output structure
dt_target <- data.table(
Start.Point = c( rep("A",3), rep("B",3),rep("C",3) ),
End.Point = rep(c("A", "B", "C"),3)
)
dt_target[, Path := str_c(Start.Point, " -> ", End.Point)]
# work the data
dt_agregate <- merge(
dt_example[period == "Start", .(id, Start.Point = state)],
dt_example[period == "End", .(id, End.Point = state)],
by = "id",
allow.cartesian = T
)
dt_agregate[, Path := str_c(Start.Point, " -> ", End.Point)]
# attach the aggreagation results
dt_target <- merge(
dt_target[,.(Path)],
dt_agregate[, .(freq = .N), .(Path)],
by = "Path",
all = T
)
# replace NA by 0
setnafill(dt_target, fill = 0, cols = 2)
dt_target
Output:
Key: <Path>
Path freq
<char> <int>
1: A -> A 1
2: A -> B 0
3: A -> C 0
4: B -> A 1
5: B -> B 0
6: B -> C 0
7: C -> A 2
8: C -> B 0
9: C -> C 0
本文标签: dataframeTransition tables from longitudinal data in the long format using RStack Overflow
版权声明:本文标题:dataframe - Transition tables from longitudinal data in the long format using R - Stack Overflow 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://it.en369.cn/questions/1745560494a2156134.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论