admin管理员组

文章数量:1122846

I have a tibble like this -

# A tibble: 11 × 1
   messycol                   
   <chr>                      
 1 ""                         
 2 ""                         
 3 ""                         
 4 ""                         
 5 ""                         
 6 ""                         
 7 "|15|16|32|33|36|39|40|41|"
 8 "|15|16|32|39|"            
 9 "|10|29|32|38|39|40|"      
10 "|32|39|"                  
11 ""                         

The dput() output is here -

structure(list(messycol = structure(c("", "", "", "", "", "", 
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|", 
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

I want to create a dummy variable from the messycol like this - if messycol includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol should be as it is - " "

So, my expected output is like this -

# A tibble: 11 × 2
   messycol                         DUMMY                 
   <chr>                      
 1 ""                               ""                      
 2 ""                               ""                          
 3 ""                               ""                           
 4 ""                               ""                          
 5 ""                               ""                         
 6 ""                               ""                         
 7 "|15|16|32|33|36|39|40|41|"       1
 8 "|15|16|32|39|"                   1     
 9 "|10|29|32|38|39|40|"             0    
10 "|32|39|"                         0              
11 ""                               ""  

I have a tibble like this -

# A tibble: 11 × 1
   messycol                   
   <chr>                      
 1 ""                         
 2 ""                         
 3 ""                         
 4 ""                         
 5 ""                         
 6 ""                         
 7 "|15|16|32|33|36|39|40|41|"
 8 "|15|16|32|39|"            
 9 "|10|29|32|38|39|40|"      
10 "|32|39|"                  
11 ""                         

The dput() output is here -

structure(list(messycol = structure(c("", "", "", "", "", "", 
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|", 
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

I want to create a dummy variable from the messycol like this - if messycol includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol should be as it is - " "

So, my expected output is like this -

# A tibble: 11 × 2
   messycol                         DUMMY                 
   <chr>                      
 1 ""                               ""                      
 2 ""                               ""                          
 3 ""                               ""                           
 4 ""                               ""                          
 5 ""                               ""                         
 6 ""                               ""                         
 7 "|15|16|32|33|36|39|40|41|"       1
 8 "|15|16|32|39|"                   1     
 9 "|10|29|32|38|39|40|"             0    
10 "|32|39|"                         0              
11 ""                               ""  
Share Improve this question edited Nov 22, 2024 at 7:35 stefan 123k6 gold badges36 silver badges72 bronze badges Recognized by R Language Collective asked Nov 22, 2024 at 4:46 SharifSharif 3913 silver badges12 bronze badges
Add a comment  | 

3 Answers 3

Reset to default 5
df |>
  mutate(dummy = case_when(
    messycol == "" ~ "",
    stringr::str_detect(messycol, "\\|15\\||\\|16\\|") ~ "1",
    TRUE ~ "0"
  ))

   messycol                    dummy
   <chr>                       <chr>
 1 ""                          ""   
 2 ""                          ""   
 3 ""                          ""   
 4 ""                          ""   
 5 ""                          ""   
 6 ""                          ""   
 7 "|15|16|32|33|36|39|40|41|" "1"  
 8 "|15|16|32|39|"             "1"  
 9 "|10|29|32|38|39|169|"      "0"  # edited input to show rejection 
10 "|115|39|"                  "0"  # edited input to show rejection
11 ""                          ""

One option would be to use string::str_split and a member of the purrr::map family of functions:

library(tidyverse)

dat |>
  mutate(
    dummy = map_chr(
      str_split(messycol, pattern = "\\|"),
      ~ as.character(+any(.x %in% c(15, 16)))
    ),
    dummy = if_else(messycol == "", messycol, dummy)
  )
#> # A tibble: 11 × 2
#>    messycol                    dummy
#>    <chr>                       <chr>
#>  1 ""                          ""   
#>  2 ""                          ""   
#>  3 ""                          ""   
#>  4 ""                          ""   
#>  5 ""                          ""   
#>  6 ""                          ""   
#>  7 "|15|16|32|33|36|39|40|41|" "1"  
#>  8 "|15|16|32|39|"             "1"  
#>  9 "|10|29|32|38|39|40|"       "0"  
#> 10 "|32|39|"                   "0"  
#> 11 ""                          ""

Or using base R you can achieve the same result e.g. like so:

dat |>
  transform(dummy = vapply(
    strsplit(messycol, split = "\\|"),
    FUN = \(x) as.character(+any(x %in% c(15, 16))),
    FUN.VALUE = character(1)
  )) |>
  transform(dummy = ifelse(messycol == "", messycol, dummy))
#>                     messycol dummy
#> 1                                 
#> 2                                 
#> 3                                 
#> 4                                 
#> 5                                 
#> 6                                 
#> 7  |15|16|32|33|36|39|40|41|     1
#> 8              |15|16|32|39|     1
#> 9        |10|29|32|38|39|40|     0
#> 10                   |32|39|     0
#> 11

Finally, another more verbose option using the tidyverse would be to use separate_longer_delim :

dat |>
  mutate(row = row_number()) |>
  separate_longer_delim(messycol, delim = "|") |>
  mutate(dummy = as.character(+any(messycol %in% c(15, 16))), .by = row) |>
  summarise(messycol = paste0(messycol, collapse = "|"), .by = c(row, dummy)) |>
  mutate(dummy = if_else(messycol == "", messycol, dummy))
#> # A tibble: 11 × 3
#>      row dummy messycol                   
#>    <int> <chr> <chr>                      
#>  1     1 ""    ""                         
#>  2     2 ""    ""                         
#>  3     3 ""    ""                         
#>  4     4 ""    ""                         
#>  5     5 ""    ""                         
#>  6     6 ""    ""                         
#>  7     7 "1"   "|15|16|32|33|36|39|40|41|"
#>  8     8 "1"   "|15|16|32|39|"            
#>  9     9 "0"   "|10|29|32|38|39|40|"      
#> 10    10 "0"   "|32|39|"                  
#> 11    11 ""    ""

Although nice answers around {dplyr} are given, I would like to add one which emphasizes the good practice to re-organise data for further analysis.

Base R misses a function like tidyr::separate_longer_delim(), so a bit more work is needed. I often use the following function

strsplit2l = \(x, p, ...) { l=strsplit(x, p, ...); lapply(l, `length<-`, max(lengths(l))) }

What to do with the output depends on the structure. Keeping a list, doing list2DF() or transposing first etc. In your case, as often, do.call(what="rbind") might be best. Storing variables in their natural format is good practice. Therefore, we convert from matrix to data.frame and apply type.convert:

# Y =
cbind(X$messycol, strsplit2l(X$messycol, "\\|") |> do.call(what="rbind")) |>
  data.frame() |> 
  type.convert(as.is=TRUE) 

giving

                          X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1                            NA NA NA NA NA NA NA NA  NA
2                            NA NA NA NA NA NA NA NA  NA
3                            NA NA NA NA NA NA NA NA  NA
4                            NA NA NA NA NA NA NA NA  NA
5                            NA NA NA NA NA NA NA NA  NA
6                            NA NA NA NA NA NA NA NA  NA
7  |15|16|32|33|36|39|40|41| NA 15 16 32 33 36 39 40  41
8              |15|16|32|39| NA 15 16 32 39 NA NA NA  NA
9        |10|29|32|38|39|40| NA 10 29 32 38 39 40 NA  NA
10                   |32|39| NA 32 39 NA NA NA NA NA  NA
11                           NA NA NA NA NA NA NA NA  NA

You might want to code your dummy variable as follows. Assume we have stored above output in the variable Y:

> Y$i = (rowSums(Y[-1]==15 | Y[-1]==16 , na.rm=TRUE)>0) * 1
> Y$i2 = replace(Y$i, Y$X1=="", 999)
> Y

                          X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 i  i2
1                            NA NA NA NA NA NA NA NA  NA 0 999
2                            NA NA NA NA NA NA NA NA  NA 0 999
3                            NA NA NA NA NA NA NA NA  NA 0 999
4                            NA NA NA NA NA NA NA NA  NA 0 999
5                            NA NA NA NA NA NA NA NA  NA 0 999
6                            NA NA NA NA NA NA NA NA  NA 0 999
7  |15|16|32|33|36|39|40|41| NA 15 16 32 33 36 39 40  41 1   1
8              |15|16|32|39| NA 15 16 32 39 NA NA NA  NA 1   1
9        |10|29|32|38|39|40| NA 10 29 32 38 39 40 NA  NA 0   0
10                   |32|39| NA 32 39 NA NA NA NA NA  NA 0   0
11                           NA NA NA NA NA NA NA NA  NA 0 999

If "" is additional information and you wish to code 15 or 16 present as integer (0, 1), we might avoid resolving to character. Instead we could use 999. But, of course, if you like do replace(Y$i, Y$X1=="", 999) instead.


Data

X = structure(
  list(
    messycol = structure(
      c(
        "",
        "",
        "",
        "",
        "",
        "",
        "|15|16|32|33|36|39|40|41|",
        "|15|16|32|39|",
        "|10|29|32|38|39|40|",
        "|32|39|",
        ""
      ),
      label = "Noteffectiveaccountingrule reason keys",
      format.sas = "$"
    )
  ),
  row.names = c(NA, -11L),
  class = c("tbl_df", "tbl", "data.frame")
)

本文标签: rcreate a dummy variable from a messy dataStack Overflow