Variable recoding

National Data Management Center for Health (NDMC) at EPHI



Recoding Variables

  • use recode() inside a mutate() statement.

Example of Recoding

Code
library(tibble)
# Toy diet-study table with 12 rows. Gender is deliberately coded in many
# inconsistent ways so that it can be recoded later. The two weight columns
# are drawn with sample() and no seed is set, so they differ between runs.
data_diet <- tibble(
  Diet = rep(c("A", "B", "B"), times = 4),
  Gender = c(
    "Male", "m", "Other", "F", "Female", "M",
    "f", "O", "Man", "f", "F", "O"
  ),
  Weight_start = sample(100:250, size = 12),
  Weight_change = sample(-10:20, size = 12)
)
head(data_diet)
# A tibble: 6 × 4
  Diet  Gender Weight_start Weight_change
  <chr> <chr>         <int>         <int>
1 A     Male            184            -2
2 B     m               182            -7
3 B     Other           229            -4
4 A     F               158             1
5 B     Female          201            19
6 B     M               239            13
  • Say we have some data about samples in a diet study but this needs lots of recoding.
Code
library(dplyr)
# Tally the rows for each distinct spelling found in Gender.
count(data_diet, Gender)
# A tibble: 9 × 2
  Gender     n
  <chr>  <int>
1 F          2
2 Female     1
3 M          1
4 Male       1
5 Man        1
6 O          2
7 Other      1
8 f          2
9 m          1

dplyr can help!

Using Excel to find all of the different ways gender has been coded could be hectic!

In dplyr you can use the recode function (need mutate here too!):

Code
# General Format - this is not code!
data_input |>
  mutate(variable_to_fix = recode(variable_to_fix, old_value = new_value,
                                  another_old_value = new_value))
Code
# Collapse the abbreviated spellings onto "Male", "Female" and "Other",
# then count the rows per recoded Gender and Diet.
data_diet |>
  mutate(
    Gender = recode(
      Gender,
      M = "Male", m = "Male", Man = "Male",
      O = "Other",
      f = "Female", F = "Female"
    )
  ) |>
  count(Gender, Diet)
# A tibble: 5 × 3
  Gender Diet      n
  <chr>  <chr> <int>
1 Female A         3
2 Female B         2
3 Male   A         1
4 Male   B         3
5 Other  B         3

Or you can use case_when()

The case_when() function of dplyr can help us to do this as well.

  • Note that values not explicitly reassigned by case_when() automatically become NA unless otherwise specified.
Code
# Only the exact value "M" has a rule; no fallback is given for other rows.
data_diet |>
  mutate(
    Gender = case_when(
      Gender == "M" ~ "Male"
    )
  )
# A tibble: 12 × 4
   Diet  Gender Weight_start Weight_change
   <chr> <chr>         <int>         <int>
 1 A     <NA>            184            -2
 2 B     <NA>            182            -7
 3 B     <NA>            229            -4
 4 A     <NA>            158             1
 5 B     <NA>            201            19
 6 B     Male            239            13
 7 A     <NA>            156            17
 8 B     <NA>            117             3
 9 B     <NA>            211            11
10 A     <NA>            165            15
11 B     <NA>            128            -5
12 B     <NA>            209            20

Use of case_when() without automatic NA

  • Here we use the original values of Gender to replace all values of Gender that do not meet the condition == "M".
Code
# "M" becomes "Male"; the catch-all TRUE branch keeps the original Gender
# value for every row that did not match the first condition.
data_diet |>
  mutate(
    Gender = case_when(
      Gender == "M" ~ "Male",
      TRUE ~ Gender
    )
  )
# A tibble: 12 × 4
   Diet  Gender Weight_start Weight_change
   <chr> <chr>         <int>         <int>
 1 A     Male            184            -2
 2 B     m               182            -7
 3 B     Other           229            -4
 4 A     F               158             1
 5 B     Female          201            19
 6 B     Male            239            13
 7 A     f               156            17
 8 B     O               117             3
 9 B     Man             211            11
10 A     f               165            15
11 B     F               128            -5
12 B     O               209            20

More complicated case_when()

Code
# Map each listed spelling onto one of three labels, then preview six rows.
data_diet |>
  mutate(
    Gender = case_when(
      Gender %in% c("M", "male", "Man", "m", "Male") ~ "Male",
      Gender %in% c("F", "Female", "f", "female") ~ "Female",
      Gender %in% c("O", "Other") ~ "Other"
    )
  ) |>
  head()
# A tibble: 6 × 4
  Diet  Gender Weight_start Weight_change
  <chr> <chr>         <int>         <int>
1 A     Male            184            -2
2 B     Male            182            -7
3 B     Other           229            -4
4 A     Female          158             1
5 B     Female          201            19
6 B     Male            239            13

Another reason for case_when()

case_when can do very sophisticated comparisons

Code
# Label each row by the direction of its weight change and store the result
# in a new table, data_diet1. The three conditions cover every non-missing
# number; a missing Weight_change matches none of them and stays NA.
data_diet1 <- data_diet |>
  mutate(Effect = case_when(Weight_change > 0 ~ "Increase",
                            Weight_change == 0 ~ "Same",
                            Weight_change < 0 ~ "Decrease"))
# Preview data_diet1 (not the original data_diet) so the new Effect column
# is actually shown.
head(data_diet1)
# A tibble: 6 × 5
  Diet  Gender Weight_start Weight_change Effect  
  <chr> <chr>         <int>         <int> <chr>   
1 A     Male            184            -2 Decrease
2 B     m               182            -7 Decrease
3 B     Other           229            -4 Decrease
4 A     F               158             1 Increase
5 B     Female          201            19 Increase
6 B     M               239            13 Increase
Code
# Number of rows for each combination of Diet and Effect.
count(data_diet1, Diet, Effect)
Code
library(ggplot2)
# Dodged bar chart of the Diet/Effect counts: Effect on the x axis,
# one bar per Diet placed side by side.
data_diet1 |>
  count(Diet, Effect) |>
  ggplot(aes(x = Effect, y = n, fill = Diet)) +
  geom_col(position = position_dodge()) +
  labs(
    y = "Individuals",
    title = "Effect of diet A & B on participants"
  )

Creating new discrete column with two levels

  • The ifelse() statement can be used to turn a numeric column into a discrete one.
Code
# ifelse(test, yes, no) gives "Increased" where the test is TRUE and
# "Not increased" otherwise, so weight loss and a change of exactly 0 share
# one level. The new column is named after the variable it summarises.
data_diet |>
  mutate(
    Weight_cat = ifelse(Weight_change > 0, "Increased", "Not increased")
  ) |>
  head()
# A tibble: 6 × 5
  Diet  Gender Weight_start Weight_change Weight_cat   
  <chr> <chr>         <int>         <int> <chr>        
1 A     Male            184            -2 Not increased
2 B     m               182            -7 Not increased
3 B     Other           229            -4 Not increased
4 A     F               158             1 Increased    
5 B     Female          201            19 Increased    
6 B     M               239            13 Increased    

Working with strings using the stringr package

The stringr package:

  • Modifying or finding part or all of a character string
  • We will not cover grep or gsub, which are base R functions
    • they are often used in forum answers
  • Almost all functions start with str_*

str_detect()

  • str_detect and str_replace search for matches to the pattern argument within each element of a character vector (not a data frame or tibble!).

  • str_detect - returns TRUE if pattern is found

  • str_replace - replaces pattern with replacement

  • The string argument specifies what to check
  • The pattern argument specifies what to check for
Code
library(stringr)
x <- c("cat", "dog", "mouse")
# One logical per element of x: TRUE where the element contains "d".
str_detect(string = x, pattern = "d")
[1] FALSE  TRUE FALSE

str_replace()

  • The replacement argument specifies what to replace the pattern with
Code
x <- c("cat", "dog", "mouse")
# Swap the lowercase "d" for an uppercase "D" in each element of x.
str_replace(string = x, pattern = "d", replacement = "D")
[1] "cat"   "Dog"   "mouse"

filter and stringr functions

Code
# Show the first four rows of the un-recoded data.
head(data_diet, n = 4)
# A tibble: 4 × 4
  Diet  Gender Weight_start Weight_change
  <chr> <chr>         <int>         <int>
1 A     Male            184            -2
2 B     m               182            -7
3 B     Other           229            -4
4 A     F               158             1
Code
# Keep only the rows whose Gender contains an uppercase "M".
data_diet |>
  filter(str_detect(string = Gender, pattern = "M"))
# A tibble: 3 × 4
  Diet  Gender Weight_start Weight_change
  <chr> <chr>         <int>         <int>
1 A     Male            184            -2
2 B     M               239            13
3 B     Man             211            11

case_when() improved with stringr

Code
# Version without stringr: every spelling has to be listed explicitly.
data_diet |>
  mutate(
    Gender = case_when(
      Gender %in% c("M", "male", "Man", "m", "Male") ~ "Male",
      Gender %in% c("F", "Female", "f", "female") ~ "Female",
      Gender %in% c("O", "Other") ~ "Other"
    )
  ) |>
  head()
# A tibble: 6 × 4
  Diet  Gender Weight_start Weight_change
  <chr> <chr>         <int>         <int>
1 A     Male            184            -2
2 B     Male            182            -7
3 B     Other           229            -4
4 A     Female          158             1
5 B     Female          201            19
6 B     Male            239            13

case_when() improved with stringr

  • ^ indicates the beginning of a character string

  • $ indicates the end

Code
# Classify by the first letter only: each pattern anchors a lower- or
# upper-case initial to the start of the string with ^.
data_diet |>
  mutate(
    Gender = case_when(
      str_detect(string = Gender, pattern = "^m|^M") ~ "Male",
      str_detect(string = Gender, pattern = "^f|^F") ~ "Female",
      str_detect(string = Gender, pattern = "^o|^O") ~ "Other"
    )
  ) |>
  count(Gender)
# A tibble: 3 × 2
  Gender     n
  <chr>  <int>
1 Female     5
2 Male       4
3 Other      3