admin管理员组文章数量:1122846
I have a tibble like this -
# A tibble: 11 × 1
messycol
<chr>
1 ""
2 ""
3 ""
4 ""
5 ""
6 ""
7 "|15|16|32|33|36|39|40|41|"
8 "|15|16|32|39|"
9 "|10|29|32|38|39|40|"
10 "|32|39|"
11 ""
The dput()
output is here -
structure(list(messycol = structure(c("", "", "", "", "", "",
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|",
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
I want to create a dummy variable from the messycol
like this - if messycol
includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol
should be as it is - " "
So, my expected output is like this -
# A tibble: 11 × 2
messycol DUMMY
<chr>
1 "" ""
2 "" ""
3 "" ""
4 "" ""
5 "" ""
6 "" ""
7 "|15|16|32|33|36|39|40|41|" 1
8 "|15|16|32|39|" 1
9 "|10|29|32|38|39|40|" 0
10 "|32|39|" 0
11 "" ""
I have a tibble like this -
# A tibble: 11 × 1
messycol
<chr>
1 ""
2 ""
3 ""
4 ""
5 ""
6 ""
7 "|15|16|32|33|36|39|40|41|"
8 "|15|16|32|39|"
9 "|10|29|32|38|39|40|"
10 "|32|39|"
11 ""
The dput()
output is here -
structure(list(messycol = structure(c("", "", "", "", "", "",
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|",
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
I want to create a dummy variable from the messycol
like this - if messycol
includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol
should be as it is - " "
So, my expected output is like this -
# A tibble: 11 × 2
messycol DUMMY
<chr>
1 "" ""
2 "" ""
3 "" ""
4 "" ""
5 "" ""
6 "" ""
7 "|15|16|32|33|36|39|40|41|" 1
8 "|15|16|32|39|" 1
9 "|10|29|32|38|39|40|" 0
10 "|32|39|" 0
11 "" ""
Share
Improve this question
edited Nov 22, 2024 at 7:35
stefan
123k6 gold badges36 silver badges72 bronze badges
Recognized by R Language Collective
asked Nov 22, 2024 at 4:46
SharifSharif
3913 silver badges12 bronze badges
3 Answers
Reset to default 5df |>
mutate(dummy = case_when(
messycol == "" ~ "",
stringr::str_detect(messycol, "\\|15\\||\\|16\\|") ~ "1",
TRUE ~ "0"
))
messycol dummy
<chr> <chr>
1 "" ""
2 "" ""
3 "" ""
4 "" ""
5 "" ""
6 "" ""
7 "|15|16|32|33|36|39|40|41|" "1"
8 "|15|16|32|39|" "1"
9 "|10|29|32|38|39|169|" "0" # edited input to show rejection
10 "|115|39|" "0" # edited input to show rejection
11 "" ""
One option would be to use string::str_split
and a member of the purrr::map
family of functions:
library(tidyverse)
dat |>
mutate(
dummy = map_chr(
str_split(messycol, pattern = "\\|"),
~ as.character(+any(.x %in% c(15, 16)))
),
dummy = if_else(messycol == "", messycol, dummy)
)
#> # A tibble: 11 × 2
#> messycol dummy
#> <chr> <chr>
#> 1 "" ""
#> 2 "" ""
#> 3 "" ""
#> 4 "" ""
#> 5 "" ""
#> 6 "" ""
#> 7 "|15|16|32|33|36|39|40|41|" "1"
#> 8 "|15|16|32|39|" "1"
#> 9 "|10|29|32|38|39|40|" "0"
#> 10 "|32|39|" "0"
#> 11 "" ""
Or using base R you can achieve the same result e.g. like so:
dat |>
transform(dummy = vapply(
strsplit(messycol, split = "\\|"),
FUN = \(x) as.character(+any(x %in% c(15, 16))),
FUN.VALUE = character(1)
)) |>
transform(dummy = ifelse(messycol == "", messycol, dummy))
#> messycol dummy
#> 1
#> 2
#> 3
#> 4
#> 5
#> 6
#> 7 |15|16|32|33|36|39|40|41| 1
#> 8 |15|16|32|39| 1
#> 9 |10|29|32|38|39|40| 0
#> 10 |32|39| 0
#> 11
Finally, another more verbose option using the tidyverse
would be to use separate_longer_delim
:
dat |>
mutate(row = row_number()) |>
separate_longer_delim(messycol, delim = "|") |>
mutate(dummy = as.character(+any(messycol %in% c(15, 16))), .by = row) |>
summarise(messycol = paste0(messycol, collapse = "|"), .by = c(row, dummy)) |>
mutate(dummy = if_else(messycol == "", messycol, dummy))
#> # A tibble: 11 × 3
#> row dummy messycol
#> <int> <chr> <chr>
#> 1 1 "" ""
#> 2 2 "" ""
#> 3 3 "" ""
#> 4 4 "" ""
#> 5 5 "" ""
#> 6 6 "" ""
#> 7 7 "1" "|15|16|32|33|36|39|40|41|"
#> 8 8 "1" "|15|16|32|39|"
#> 9 9 "0" "|10|29|32|38|39|40|"
#> 10 10 "0" "|32|39|"
#> 11 11 "" ""
Although nice answers around {dplyr}
are given, I would like to add one which emphasizes the good practice to re-organise data for further analysis.
Base R misses a function like tidyr::separate_longer_delim()
, so a bit more work is needed. I often use the following function
strsplit2l = \(x, p, ...) { l=strsplit(x, p, ...); lapply(l, `length<-`, max(lengths(l))) }
What to do with the output depends on the structure. Keeping a list, doing list2DF()
or t
ransposing first etc. In your case, as often, do.call(what="rbind")
might be best. Storing variables in their natural format is good practice. Therefore, we convert from matrix
to data.frame
and apply type.convert
:
# Y =
cbind(X$messycol, strsplit2l(X$messycol, "\\|") |> do.call(what="rbind")) |>
data.frame() |>
type.convert(as.is=TRUE)
giving
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 NA NA NA NA NA NA NA NA NA
2 NA NA NA NA NA NA NA NA NA
3 NA NA NA NA NA NA NA NA NA
4 NA NA NA NA NA NA NA NA NA
5 NA NA NA NA NA NA NA NA NA
6 NA NA NA NA NA NA NA NA NA
7 |15|16|32|33|36|39|40|41| NA 15 16 32 33 36 39 40 41
8 |15|16|32|39| NA 15 16 32 39 NA NA NA NA
9 |10|29|32|38|39|40| NA 10 29 32 38 39 40 NA NA
10 |32|39| NA 32 39 NA NA NA NA NA NA
11 NA NA NA NA NA NA NA NA NA
You might want to code your dummy variable as follows. Assume we have stored above output in the variable Y
:
> Y$i = (rowSums(Y[-1]==15 | Y[-1]==16 , na.rm=TRUE)>0) * 1
> Y$i2 = replace(Y$i, Y$X1=="", 999)
> Y
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 i i2
1 NA NA NA NA NA NA NA NA NA 0 999
2 NA NA NA NA NA NA NA NA NA 0 999
3 NA NA NA NA NA NA NA NA NA 0 999
4 NA NA NA NA NA NA NA NA NA 0 999
5 NA NA NA NA NA NA NA NA NA 0 999
6 NA NA NA NA NA NA NA NA NA 0 999
7 |15|16|32|33|36|39|40|41| NA 15 16 32 33 36 39 40 41 1 1
8 |15|16|32|39| NA 15 16 32 39 NA NA NA NA 1 1
9 |10|29|32|38|39|40| NA 10 29 32 38 39 40 NA NA 0 0
10 |32|39| NA 32 39 NA NA NA NA NA NA 0 0
11 NA NA NA NA NA NA NA NA NA 0 999
If ""
is additional information and you wish to code 15
or 16
present as integer (0
, 1
), we might avoid resolving to character
. Instead we could use 999
. But, of course, if you like do replace(Y$i, Y$X1=="", 999)
instead.
Data
X = structure(
list(
messycol = structure(
c(
"",
"",
"",
"",
"",
"",
"|15|16|32|33|36|39|40|41|",
"|15|16|32|39|",
"|10|29|32|38|39|40|",
"|32|39|",
""
),
label = "Noteffectiveaccountingrule reason keys",
format.sas = "$"
)
),
row.names = c(NA, -11L),
class = c("tbl_df", "tbl", "data.frame")
)
本文标签: rcreate a dummy variable from a messy dataStack Overflow
版权声明:本文标题:r - create a dummy variable from a messy data - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1736305586a1932641.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论