R:找到在每个观测值的时间范围内出现的变量的每个唯一值的计数

gan*_*wag 5 r time-series data.table

我有一个面板数据集,其中包含对个人随时间的观察。一共有三列:人员,日期和值。我想创建三个新变量。第一:在观察前一年内每个人日在值列中的唯一条目数。第二:一组虚拟变量,反映该人在过去一年中是否在值栏中有每个可能的条目。第三:一组计数变量,用于统计该人在上一年中在值列中每个可能条目的次数。

我有办法在玩具数据集上执行此操作,但是当我尝试将我的方法应用于大型数据集时,内存不足。

有没有更有效的方法来获取这三个变量?

我的玩具数据集方法有5个步骤:

  1. 创建两个新变量集,它们是每个人的滞后日期和值。我落后于人数最多的人。
  2. 我遍历新变量并将每个变量与当前日期进行比较。如果当前日期少于滞后时间,则我用相应的滞后值变量填充滞后日期变量。我填写而不是创建新列以节省内存。如果滞后变量超出范围,则我将字符串“ OOR”替换为滞后日期,以表示超出范围。
  3. 我将滞后的日期变量粘贴在一起。将有一些值输入和一堆“ OOR”。我用正则表达式删除了“ OOR”。我用唯一的字符(例如“-”)分隔字符串中的所有条目
  4. 我循环遍历value列中的所有可能条目,并为步骤3中粘贴的字符串是否匹配可能的条目创建一个新变量,并计算粘贴的字符串中可能条目的出现次数。
  5. 我计算了步骤3中粘贴的字符串中的唯一条目,键入了分隔字符串“-”。
# In this example, "Species" = "person", "Sepal.Width" = "value"
# I use 3 days instead of 1 year

library(data.table)
library(zoo)
library(stringr)

set.seed(481516)
dt <- as.data.table(iris)
dt[, date := as.Date(seq_len(.N), format = "%Y"), by = Species]
dt[, sepal_width_above_3 := ifelse(Sepal.Width > 3, 1, 0)]
dt[, random_drop := runif(nrow(dt))]
dt <- dt[random_drop >= 0.1]

dt_lag <- dt[, .(Species, date, Sepal.Width)]

#step 1: lag the date - a new var for every species-date combo
dt_lag[, species_count := .N, by = Species]
keep_names <- names(dt_lag)[names(dt_lag) != "Species"]
dt_lag <- cbind(dt_lag[, ..keep_names], 
                dt_lag[, shift(date, n=1:max(dt_lag$species_count), give.names = T), by = Species],
                dt_lag[, shift(Sepal.Width, n=1:max(dt_lag$species_count), give.names = T), by = Species])


#step 2: which values do we keep?
for (X in names(dt_lag)[names(dt_lag) %like% "^date_lag_"]){
  Y <- gsub("date","Sepal.Width", X)
  dt_lag[, (X) := ifelse(date - get(X) > 0 & date - get(X) <= 3, get(Y) , "OOR")]
  dt_lag[, (Y) := NULL]
}

#step 3: paste together
dt_lag[, sepal_width_values_within_lag_3 := gsub('-NA|-OOR','',do.call(paste, c(.SD, sep="-"))), .SDcols = names(dt_lag) %like% "date_lag_"]
for (X in names(dt_lag)[names(dt_lag) %like% "^date_lag_"]){
  dt_lag[, (X) := NULL]
}

#step 4: counts and dummies for each type
for (X in sort(unique(dt_lag$Sepal.Width))) {
  # X = 1
  spec_count = paste("sepal_width_count_lag_365_",X,sep="")
  spec_dummy = paste("sepal_width_dummy_lag_365_",X,sep="")
  pattern = as.character(X)

  dt_lag[, (spec_count) := str_count(sepal_width_values_within_lag_3, eval(pattern))]
  dt_lag[, (spec_dummy) := str_detect(sepal_width_values_within_lag_3, eval(pattern))]
}

#step 5: unique counts
dt_lag[, unique_sepal_width_values_within_lag_3_count := sapply(sepal_width_values_within_lag_3, function(x)
  length(    # count items
    unique(   # that are unique
      scan(   # when arguments are presented to scan as text
        text=x, what="", sep ="-",  # when separated by ","
        quiet=TRUE)))  )]
Run Code Online (Sandbox Code Playgroud)

以下是值列中1个条目的结果(宽度= 2)

head(dt_lag[,.(date, Species, sepal_width_values_within_lag_3, sepal_width_count_lag_365_2, sepal_width_dummy_lag_365_2, unique_sepal_width_values_within_lag_3_count)])

    date      Species sepal_width_values_within_lag_3 sepal_width_count_lag_365_2 sepal_width_dummy_lag_365_2 unique_sepal_width_values_within_lag_3_count
1: 1970-09-14  setosa                             3.5                           0                       FALSE                                            1
2: 1970-09-15  setosa                           3-3.5                           0                       FALSE                                            2
3: 1970-09-16  setosa                       3.2-3-3.5                           1                        TRUE                                            3
4: 1970-09-17  setosa                       3.1-3.2-3                           1                        TRUE                                            3
5: 1970-09-18  setosa                     3.6-3.1-3.2                           1                        TRUE                                            3
6: 1970-09-19  setosa                     3.9-3.6-3.1                           0                       FALSE                                            3
Run Code Online (Sandbox Code Playgroud)

Wim*_*pel 4

这只是部分答案,因为我没有完全理解你的第二个问题和第三个问题......

#create data.table with the correct names, based on your sample data (i think)
DT <- dt[, .(person = Species, date, value = Sepal.Width)]
#set keys
setkey(DT, person, date)
#create unique values of `value in the last year before the observation, for each `person
DT[ DT, 
    #get the unique values for the last year, suppress immediate output with {}
    unique_values_prev_year := {
      val = DT[ person == i.person & date %between% c( i.date - lubridate::years(1), i.date) ]$value
      unique_val = sort( unique( val ) )
      list( paste0( unique_val, collapse = "-" ) )
      }, 
    #do the above for each row
    by = .EACHI ]
Run Code Online (Sandbox Code Playgroud)

输出

#         person       date value                           unique_values_prev_year
# 1:      setosa 1970-09-14   3.5                                               3.5
# 2:      setosa 1970-09-15   3.0                                             3-3.5
# 3:      setosa 1970-09-16   3.2                                         3-3.2-3.5
# 4:      setosa 1970-09-17   3.1                                     3-3.1-3.2-3.5
# 5:      setosa 1970-09-19   3.9                                 3-3.1-3.2-3.5-3.9
# ---                                                                             
# 133: virginica 1970-10-28   3.3 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 134: virginica 1970-10-29   3.0 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 135: virginica 1970-10-30   2.5 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 136: virginica 1970-10-31   3.0 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 137: virginica 1970-11-01   3.4 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
Run Code Online (Sandbox Code Playgroud)