"tfread"存在吗?

jan*_*glx 6 csv r data.table

在 R 中，有没有一种高效的方法来读取转置的 .csv 文件？

例如,考虑以下文本文件:

Name,Peter,Paul,Mary
Age,40,5,38
Run Code Online (Sandbox Code Playgroud)

可以通过以下方式将其读入 data.table，并得到合适的列类型：

library(data.table)    
# Minimal example: read a transposed CSV (one variable per line, name first)
# into a data.table with one column per variable.
file <- tempfile("tmp.txt")
# No trailing "\n" in the string: writeLines() already terminates the text
# with a newline, so adding one would create an empty extra line.
writeLines("Name,Peter,Paul,Mary\nAge,40,5,38", file)

lines <- readLines(file)
# Skip blank lines instead of dropping a hard-coded line index.
lines <- lines[nzchar(lines)]
# Turn each comma-separated line into newline-separated text so that fread()
# parses it as a single column whose first entry is the header.
lines <- gsub(pattern = ",", replacement = "\n", lines, fixed = TRUE)
lines <- lapply(lines, fread)
do.call(cbind, lines)
#>     Name Age
#> 1: Peter  40
#> 2:  Paul   5
#> 3:  Mary  38
Run Code Online (Sandbox Code Playgroud)

有没有更简单的方法来实现这一目标?是否有更高效的版本(我的文件是1 GB)?

请注意，对于像 data.table 这样的列式存储结构来说，读取这种按列存储（column-major）的文件本应更加容易。

jan*_*glx 2

不幸的是,tfread似乎不存在。

各解决方案的耗时（200 个观测 × 20000 个字符/整数变量 / 20000 个观测 × 200 个字符/整数变量）：

  1. readLines-fread(@jan-glx):7 秒/1.2 秒
  2. fread- transpose- paste- read.table(@Onymambu):8 秒/36 秒
  3. all-within-j: fread- transpose- write.csv- paste- fread(@Clayton Stanley):5 分钟/12 秒
  4. 命令行转置- fread(@jan-glx):2.4 s / 1.6 s
  5. iotools-paste-paste-fread (@jan-glx):1.4 秒/1.2 秒
  6. fread- transpose- type.convert(@Frank): 4.2 秒 / 3.6 秒

代码:

library(data.table)    
# Benchmark fixture: a transposed CSV with 2 * p lines, each holding a
# variable name followed by 2 * n comma-separated values.
file <- tempfile("tmp.txt")
p <- 100 # = 200 lines/columns
n <- 10000 # = 20000 values per line / rows
name_line <- paste(
  "Name", paste0(rep(c("Peter", "Paul"), n), collapse = ","),
  sep = ","
)
age_line <- paste(
  "Age", paste0(rep(c("40", "5"), n), collapse = ","),
  sep = ","
)
# Open the connection explicitly so it can be closed again: writeLines() does
# not close a connection that was already open, which would leak it and could
# leave buffered data unwritten when the benchmarks read the file.
con <- file(file, "wb")
writeLines(rep(c(name_line, age_line), p), con)
close(con)

system.time({ # 1: readLines -> one fread() call per input line
  # Replace every comma with a newline so that each input line becomes a
  # one-column block of text, which fread() then parses on its own.
  lines <- lapply(
    readLines(file),
    gsub,
    pattern = ",", replacement = "\n", fixed = TRUE
  )
  lines <- lapply(lines, fread)
  # Bind the single-column tables side by side.
  dt <- do.call(cbind, lines)
  dim(dt)
})

system.time({ # 2: fread -> transpose -> paste -> read.table
  # Read every field as data; the first line is not a header here.
  wide <- fread(file, header = FALSE)
  # After transposing, each row holds one record. paste() joins its fields
  # with single spaces into one text line per record; the first line carries
  # the variable names.
  record_lines <- do.call(paste, transpose(wide))
  # Re-parse the text so that read.table() infers the column types.
  # NOTE: read.table() splits on whitespace by default, so values that
  # contain spaces would be broken apart by this approach.
  DT <- setDT(
    read.table(text = record_lines, header = TRUE, stringsAsFactors = FALSE)
  )
  dim(DT)
})

system.time({ # 3
# "All within j" approach: every step is chained as a data.table `[...]` call.
# Read every field as character, without a header row.
# NOTE(review): `F` is the reassignable shorthand for FALSE; consider spelling it out.
aTbl = fread(file, colClasses="character", header=F)
invisible(
  # Each `][` below applies one step to the result of the previous step.
  aTbl[, .SD
       # Swap rows and columns.
       ][, transpose(.SD)
         # Use the values of the first row as the column names.
         ][, setnames(.SD, .SD[1, t(.SD)])
           # Keep rows 2 to .N, i.e. drop the row that supplied the names.
           ][2:.N
             # Write the table as CSV text, capture it, and re-parse it with
             # fread() (presumably so that column types are inferred again).
             ][, fread(paste0(capture.output(write.csv(.SD, stdout(), row.names=F, quote=F)), collapse='\n'))
               # Export a copy as `bTbl` via `<<-` so it is available after the chain.
               ][, {bTbl <<- copy(.SD); .SD}
                 ]
)
dim(bTbl)
})

system.time({ # 4 wide: command-line transpose -> fread
  # Build the shell command first, then hand it to fread(), which runs it and
  # parses the command's output.
  # NOTE(review): `-l 20005x205` presumably sets the tool's dimension limit;
  # confirm against the `transpose` tool's help.
  transpose_cmd <- sprintf("transpose -t -l 20005x205 --fsep , \"%s\"", file)
  dt <- fread(transpose_cmd)
  dim(dt)
})

system.time({ # 4 long: command-line transpose -> fread
  # Same as the "wide" variant, but with the dimension argument swapped.
  # NOTE(review): `-l 205x20005` presumably sets the tool's dimension limit;
  # confirm against the `transpose` tool's help.
  transpose_cmd <- sprintf("transpose -t -l 205x20005 --fsep , \"%s\"", file)
  dt <- fread(transpose_cmd)
  dim(dt)
})

system.time({ # 5: iotools chunks -> paste -> paste -> fread
  infile <- file(file, "rb")
  # Transpose one chunk of raw lines as text and parse the result with fread().
  transpose_chunk <- function(x) {
    # One matrix row per input line, one matrix column per field.
    fields <- iotools::mstrsplit(x, sep = ",")
    # Joining each matrix column yields the lines of the transposed chunk.
    transposed_rows <- apply(fields, 2, paste0, collapse = ",")
    fread(paste0(transposed_rows, collapse = "\n"))
  }
  # Close the connection whether or not reading succeeds; the original left
  # it open.
  df <- tryCatch(
    iotools::chunk.tapply(infile, transpose_chunk, CH.MERGE = cbind),
    finally = close(infile)
  )
  dim(df)
})

system.time({ # 6: fread -> transpose -> type.convert
  # Read every field as data; column 1 holds the variable names and the
  # remaining columns hold the values.
  src <- fread(file, header = FALSE)
  # Transpose the values and convert each resulting column to its natural
  # type. `as.is = TRUE` keeps strings as character (no factors) and avoids
  # the warning R >= 4.0 raises when `as.is` is left unspecified.
  d <- src[, lapply(transpose(.SD[, -1L]), type.convert, as.is = TRUE)]
  # Name the columns after the first field of each input line.
  setnames(d, src[[1L]])
  dim(d)
})
Run Code Online (Sandbox Code Playgroud)