r - Efficient comparison of POSIXct in data.table -
Hello, I am looking for an efficient way of selecting the rows of a
data.table by a POSIXct column,
such that the time of day is less than or equal to 12:00:00
(note that millisecond precision is not required, so we can use ITime,
for example).
# Build a reproducible sample: n random timestamps scattered around the Unix
# epoch (standard deviation 1e5 seconds), stored as POSIXct in GMT.
library(data.table)

set.seed(1)
n <- 1e7
dt <- data.table(dts = .POSIXct(1e5 * rnorm(n), tz = "GMT"))
dt
#                                  dts
#        1: 1969-12-31 06:35:54.618925
#        2: 1970-01-01 05:06:04.332422
#       ---
#  9999999: 1970-01-03 00:37:00.035565
# 10000000: 1969-12-30 08:30:23.624506
One solution (the problem here is that the cast is costly if n is big):
# Time-of-day filter based on data.table's ITime class.
#
# t  : vector of date-times; converted with as.ITime().
# st : start of the time-of-day window, e.g. "00:00:00" (inclusive).
# et : end of the time-of-day window, e.g. "11:59:59" (inclusive).
#
# Returns a logical vector, TRUE where the time of day of `t` lies in
# [st, et].
f <- function(t, st, et) {
  time <- as.ITime(t)
  time >= as.ITime(st) & time <= as.ITime(et)
}

# xts-style filter (GeekTrader's solution).
#
# t : vector of POSIXct date-times.
# s : an ISO-8601 range string understood by xts's .parseISO8601(), either a
#     time-of-day range such as "T00:00:00/T11:59:59" or a date(-time) range
#     such as "1970", "197001" or "19700102 00:00:00/19700103 12:00:00".
#
# Returns a logical vector, TRUE where `t` falls inside the range.
p <- function(t, s) {
  ep <- .parseISO8601(s)
  if (grepl("T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}", s)) {
    # Time-of-day range: compare seconds since midnight instead of instants.
    first.time <- as.double(ep$first.time)
    # 31449600 = 364 * 86400 seconds.
    # NOTE(review): presumably .parseISO8601() anchors the end of a time-only
    # range 364 days after its start, and this shifts it back onto the first
    # day -- confirm against the xts documentation.
    last.time <- as.double(ep$last.time) - 31449600
    # Seconds elapsed since midnight UTC; the numeric value of a POSIXct is
    # seconds since the epoch, so no class conversion is needed.
    secofday <- as.double(t) %% 86400
    secofday >= first.time & secofday <= last.time
  } else {
    # Date(-time) range: compare the instants directly.
    t >= ep$first.time & t <= ep$last.time
  }
}
A quick performance comparison:
# Time both filters on the morning window 00:00:00-11:59:59 and check that
# they select the same rows. The printed timings are the ones reported with
# the original run.
system.time(resf <- dt[f(dts, "00:00:00", "11:59:59")])
#    user  system elapsed
#    1.01    0.28    1.29
system.time(resp <- dt[p(dts, "T00:00:00/T11:59:59")])
#    user  system elapsed
#    0.64    0.13    0.76
identical(resf, resp)
# [1] TRUE
# Self-contained benchmark: define both filters, build the sample data, time
# each filter on the window 00:00:00-12:00:00 and inspect the results.
library(data.table)
library(xts)

# xts-style filter: TRUE where `t` falls inside the ISO-8601 range `s`
# (a time-of-day range like "T00:00:00/T12:00:00" or a date(-time) range).
p <- function(t, s) {
  ep <- .parseISO8601(s)
  if (grepl("T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}", s)) {
    first.time <- as.double(ep$first.time)
    # 31449600 = 364 * 86400 seconds.
    # NOTE(review): presumably shifts the parsed end of a time-only range
    # back onto the first day -- confirm against the xts documentation.
    last.time <- as.double(ep$last.time) - 31449600
    # Seconds elapsed since midnight UTC.
    secofday <- as.double(t) %% 86400
    secofday >= first.time & secofday <= last.time
  } else {
    t >= ep$first.time & t <= ep$last.time
  }
}

# ITime-based filter: TRUE where the time of day of `t` lies in [st, et].
f <- function(t, st, et) {
  time <- as.ITime(t)
  time >= as.ITime(st) & time <= as.ITime(et)
}

Sys.setenv(TZ = "GMT")
n <- 1e7
set.seed(1)
dt <- data.table(dts = .POSIXct(1e5 * rnorm(n), tz = "GMT"))

system.time(resp <- dt[p(dts, "T00:00:00/T12:00:00"), ])
##    user  system elapsed
##    1.11    0.11    1.22
system.time(resf <- dt[f(dts, "00:00:00", "12:00:00")])
##    user  system elapsed
##    2.22    0.29    2.51

resp
##                          dts
##       1: 1969-12-31 06:35:54
##       2: 1970-01-01 05:06:04
##       3: 1969-12-31 00:47:17
##       4: 1970-01-01 09:09:10
##       5: 1969-12-31 01:12:33
##      ---
## 5000672: 1970-01-01 06:08:15
## 5000673: 1970-01-01 05:02:27
## 5000674: 1969-12-31 02:25:24
## 5000675: 1970-01-03 00:37:00
## 5000676: 1969-12-30 08:30:23

resf
##                          dts
##       1: 1969-12-31 06:35:54
##       2: 1970-01-01 05:06:04
##       3: 1969-12-31 00:47:17
##       4: 1970-01-01 09:09:10
##       5: 1969-12-31 01:12:33
##      ---
## 5000672: 1970-01-01 06:08:15
## 5000673: 1970-01-01 05:02:27
## 5000674: 1969-12-31 02:25:24
## 5000675: 1970-01-03 00:37:00
## 5000676: 1969-12-30 08:30:23

# Check correctness: the latest selected timestamp on each day must not be
# later than 12:00:00. (The column is named `mindts` but holds the per-day
# maximum; the name is kept so it matches the printed output.)
resp[, list(mindts = max(dts)), by = list(as.Date(dts))]
##        as.Date              mindts
##  1: 1969-12-31 1969-12-31 12:00:00
##  2: 1970-01-01 1970-01-01 12:00:00
##  3: 1969-12-29 1969-12-29 12:00:00
##  4: 1970-01-02 1970-01-02 12:00:00
##  5: 1969-12-30 1969-12-30 12:00:00
##  6: 1970-01-03 1970-01-03 12:00:00
##  7: 1970-01-04 1970-01-04 11:59:59
##  8: 1970-01-05 1970-01-05 11:59:45
##  9: 1969-12-28 1969-12-28 12:00:00
## 10: 1969-12-27 1969-12-27 11:59:21
## 11: 1970-01-06 1970-01-06 10:53:21
## 12: 1969-12-26 1969-12-26 10:15:03
## 13: 1970-01-07 1970-01-07 08:21:55

resf[, list(mindts = max(dts)), by = list(as.Date(dts))]
##        as.Date              mindts
##  1: 1969-12-31 1969-12-31 12:00:00
##  2: 1970-01-01 1970-01-01 12:00:00
##  3: 1969-12-29 1969-12-29 12:00:00
##  4: 1970-01-02 1970-01-02 12:00:00
##  5: 1969-12-30 1969-12-30 12:00:00
##  6: 1970-01-03 1970-01-03 12:00:00
##  7: 1970-01-04 1970-01-04 11:59:59
##  8: 1970-01-05 1970-01-05 11:59:45
##  9: 1969-12-28 1969-12-28 12:00:00
## 10: 1969-12-27 1969-12-27 11:59:21
## 11: 1970-01-06 1970-01-06 10:53:21
## 12: 1969-12-26 1969-12-26 10:15:03
## 13: 1970-01-07 1970-01-07 08:21:55
Now a demo of the nice xts-style
subsetting that p() also supports:
# xts-style subsetting with p(): the range string may name a year, a month,
# a day, or an explicit from/to pair of date-times.

# Whole year 1970.
dt[p(dts, "1970")]
##                          dts
##       1: 1970-01-01 05:06:04
##       2: 1970-01-02 20:18:48
##       3: 1970-01-01 09:09:10
##       4: 1970-01-01 13:32:22
##       5: 1970-01-01 20:30:32
##      ---
## 5001741: 1970-01-02 15:51:12
## 5001742: 1970-01-03 01:41:31
## 5001743: 1970-01-01 06:08:15
## 5001744: 1970-01-01 05:02:27
## 5001745: 1970-01-03 00:37:00

# January 1970.
dt[p(dts, "197001")]
##                          dts
##       1: 1970-01-01 05:06:04
##       2: 1970-01-02 20:18:48
##       3: 1970-01-01 09:09:10
##       4: 1970-01-01 13:32:22
##       5: 1970-01-01 20:30:32
##      ---
## 5001741: 1970-01-02 15:51:12
## 5001742: 1970-01-03 01:41:31
## 5001743: 1970-01-01 06:08:15
## 5001744: 1970-01-01 05:02:27
## 5001745: 1970-01-03 00:37:00

# A single day.
dt[p(dts, "19700102")]
##                          dts
##       1: 1970-01-02 20:18:48
##       2: 1970-01-02 17:59:38
##       3: 1970-01-02 07:14:53
##       4: 1970-01-02 02:13:03
##       5: 1970-01-02 01:31:37
##      ---
## 1519426: 1970-01-02 11:25:24
## 1519427: 1970-01-02 10:00:21
## 1519428: 1970-01-02 05:21:25
## 1519429: 1970-01-02 05:11:26
## 1519430: 1970-01-02 15:51:12

# An explicit from/to range of date-times.
dt[p(dts, "19700102 00:00:00/19700103 12:00:00")]
##                          dts
##       1: 1970-01-02 20:18:48
##       2: 1970-01-02 17:59:38
##       3: 1970-01-02 07:14:53
##       4: 1970-01-02 02:13:03
##       5: 1970-01-02 01:31:37
##      ---
## 1785762: 1970-01-02 05:21:25
## 1785763: 1970-01-02 05:11:26
## 1785764: 1970-01-02 15:51:12
## 1785765: 1970-01-03 01:41:31
## 1785766: 1970-01-03 00:37:00

# Check correctness again: the extremes of the selection must stay inside
# the requested range.
dt[p(dts, "19700102 00:00:00/19700103 12:00:00"), max(dts)]
## [1] "1970-01-03 12:00:00 GMT"
dt[p(dts, "19700102 00:00:00/19700103 12:00:00"), min(dts)]
## [1] "1970-01-02 00:00:00 GMT"
Comments
Post a Comment