Most efficient way to turn factor matrix into binary (indicator) matrix in R

I can think of several ways to turn matrix (data frame) of this type:

    dat = data.frame(
    x1 = rep(c('a', 'b'), 100),
    x2 = rep(c('x', 'y'), 100)
)

head(dat)
  x1 x2
1  a  x
2  b  y
3  a  x
4  b  y
5  a  x
6  b  y

Into a binary (indicator) matrix (or data frame) like this:

a  b  x  y
1  0  1  0
0  1  0  1
...

(This structure is, of course, trivial and only for illustrative purpose!)

Many thanks!

Answers


We can use table

tbl <- table(rep(1:nrow(dat),2),unlist(dat))
head(tbl, 2)
#    a b x y
#  1 1 0 1 0
#  2 0 1 0 1

Or a possibly efficient option would be

library(Matrix)
sM <- sparse.model.matrix(~ -1 + x1 +x2, dat, 
      contrasts.arg = lapply(dat, contrasts, contrasts = FALSE))
colnames(sM) <- sub(".*\\d", "", colnames(sM))
head(sM, 2)
# 2 x 4 sparse Matrix of class "dgCMatrix"
# a b x y
#1 1 . 1 .
#2 . 1 . 1

It can be converted to binary by converting to matrix

head(as.matrix(sM),2)  
#  a b x y
#1 1 0 1 0
#2 0 1 0 1

There are some good solutions posted already, but none are optimal for performance. We can optimize performance by looping over each input column, and then looping over each factor level index within each input column and doing a straight integer comparison of the factor indexes. It's not the most concise or elegant piece of code, but it's fairly straightforward and fast:

do.call(cbind,lapply(dat,function(col)
    `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i)
        as.integer(as.integer(col)==i)
    )),levels(col))
));

Performance:

library(Matrix);
library(data.table);
library(microbenchmark);

bgoldst <- function(dat) do.call(cbind,lapply(dat,function(col) `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i) as.integer(as.integer(col)==i))),levels(col))));
akrun1 <- function(dat) table(rep(1:nrow(dat),2),unlist(dat));
akrun2 <- function(dat) sparse.model.matrix(~-1+x1+x2,dat,contrasts.arg=lapply(dat,contrasts,contrasts=FALSE));
davidar <- function(dat) { dat[,rowid:=.I]; dcast(melt(dat,id='rowid'),rowid~value,length); }; ## requires a data.table
dataminer <- function(dat) t(apply(dat,1,function(x) as.numeric(unique(unlist(dat))%in%x)));

N <- 100L; dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
identical(unname(bgoldst(dat)),matrix(as.vector(akrun1(dat)),ncol=4L));
## [1] TRUE
identical(unname(bgoldst(dat)),unname(matrix(as.integer(as.matrix(akrun2(dat))),ncol=4L)));
## [1] TRUE
identical(bgoldst(dat),as.matrix(davidar(datDT)[,rowid:=NULL]));
## [1] TRUE
identical(unname(bgoldst(dat)),matrix(as.integer(dataminer(dat)),ncol=4L));
## [1] TRUE

N <- 100L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT),dataminer(dat));
## Unit: microseconds
##            expr       min        lq       mean     median         uq       max neval
##    bgoldst(dat)    67.570    92.374   106.2853    99.6440   121.2405   188.596   100
##     akrun1(dat)   581.182   652.386   773.6300   690.6605   916.4625  1192.299   100
##     akrun2(dat)  4429.208  4836.119  5554.5902  5145.3135  5977.0990 11263.537   100
##  davidar(datDT)  5064.273  5498.555  6104.7621  5664.9115  6203.9695 11713.856   100
##  dataminer(dat) 47577.729 49529.753 55217.3726 53190.8940 60041.9020 74346.268   100

N <- 1e4L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
##            expr       min        lq      mean   median        uq        max neval
##    bgoldst(dat)  1.775617  1.820949  2.299493  1.84725  1.972124   8.362336   100
##     akrun1(dat) 38.954524 41.109257 48.409613 45.60304 52.147633 162.365472   100
##     akrun2(dat) 16.915832 17.762799 21.288200 19.20164 23.775180  46.494055   100
##  davidar(datDT) 36.151684 38.366715 42.875940 42.38794 45.916937  58.695008   100

N <- 1e5L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
##            expr       min        lq      mean    median        uq      max neval
##    bgoldst(dat)  17.16473  22.97654  35.01815  26.76662  31.75562 152.6188   100
##     akrun1(dat) 501.72644 626.14494 671.98315 680.91152 727.88262 828.8313   100
##     akrun2(dat) 212.12381 242.65505 298.90254 272.28203 357.65106 429.6023   100
##  davidar(datDT) 368.04924 461.60078 500.99431 511.54921 540.39358 638.3840   100

If you have a data.frame as you are showing (not a matrix), you could as well recast the data

library(data.table)
setDT(dat)[, rowid := .I] # Creates a row index
res <- dcast(melt(dat, id = "rowid"), rowid ~ value, length) # long/wide format
head(res)
#   rowid a b x y
# 1     1 1 0 1 0
# 2     2 0 1 0 1
# 3     3 1 0 1 0
# 4     4 0 1 0 1
# 5     5 1 0 1 0
# 6     6 0 1 0 1

Some benchmarks

dat = data.frame(
  x1 = rep(c('a', 'b'), 1e3),
  x2 = rep(c('x', 'y'), 1e3)
)

library(data.table)
library(Matrix)
library(microbenchmark)

dat2 <- copy(dat)


microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
               "akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
               "DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
               "David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
               times = 10L)
# Unit: milliseconds
#          expr         min          lq        mean      median         uq        max neval cld
#     akrun1 :     3.826075    4.061904    6.654399    5.165376   11.26959   11.82029    10  a 
#     akrun2 :     5.269531    5.713672    8.794434    5.943422   13.34118   20.01961    10  a 
#  DatamineR :  3199.336286 3343.774160 3410.618547 3385.756972 3517.22133 3625.70909    10   b
#   David Ar :     8.092769    8.254682   11.030785    8.465232   15.44893   19.83914    10  a 

The apply solution is highly inefficient and will take forever on a bigger data set. Comparing for a bigger data set while excluding the apply solution

dat = data.frame(
  x1 = rep(c('a', 'b'), 1e4),
  x2 = rep(c('x', 'y'), 1e4)
)

dat2 <- copy(dat)

microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
               "akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
               #"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
               "David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
               times = 100L)
# Unit: milliseconds
#        expr      min       lq     mean   median       uq      max neval cld
#   akrun1 :  38.66744 41.27116 52.97982 42.72534 47.17203 161.0420   100   b
#   akrun2 :  17.02006 18.93534 27.27582 19.35580 20.72022 153.2397   100  a 
# David Ar :  34.15915 37.91659 46.11050 38.58536 41.40412 149.0038   100   b

Seems like the Matrix package shines for a bigger data sets.

It probably worth comparing different scenarios when there are more columns/unique values too.


One alternative using apply

head(t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))))
     [,1] [,2] [,3] [,4]
[1,]    1    0    1    0
[2,]    0    1    0    1
[3,]    1    0    1    0
[4,]    0    1    0    1
[5,]    1    0    1    0
[6,]    0    1    0    1

Need Your Help

How to merge two json string in Python?

python json string kazoo

I recently started working with Python and I am trying to concatenate one of my JSON String with existing JSON String. I am also working with Zookeeper so I get the existing json string from zookee...

How to open file in Emacs via eshell?

emacs buffer eshell

When in eshell is there a command for opening a file in another buffer?