From 7bab1ebeac5eace118d312d774629e4d9f897b0e Mon Sep 17 00:00:00 2001 From: avimallu <66530011+avimallu@users.noreply.github.com> Date: Thu, 16 Jul 2020 19:18:28 +0530 Subject: [PATCH] Add files via upload --- agg_by_period.R | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ decile.R | 27 +++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 agg_by_period.R create mode 100644 decile.R diff --git a/agg_by_period.R b/agg_by_period.R new file mode 100644 index 0000000..4e15feb --- /dev/null +++ b/agg_by_period.R @@ -0,0 +1,79 @@ + +# Required libraries ------------------------------------------------------ + +library(data.table) +library(purrr) # to be removed once mergelist is released in data.table + +# Sample large data table ------------------------------------------------- + +DT = data.table( + dates = seq.Date(as.Date("2019-01-01"), by = "1 week", length.out = 200), + grp_1 = c("India", "Brazil", "China", "Russia", "South Africa"), + grp_2 = c("Buttons", "Salt"), + sales = runif(1e8, 0, 100) +) + +# Aggregation function ---------------------------------------------------- + +agg_by_period <- function(i, DT, j, by, index) { + + # sum is optimized internally in data.table + # between is parallelized in data.table + # key the data.table by the index to have even faster operation times + temp <- + setnames(DT[ + between(get(index), date_periods[[i]][[1]], date_periods[[i]][[2]]), + lapply(.SD, sum, na.rm = TRUE), keyby = by, .SDcols = j], + j, paste0(j, "_", names(date_periods[i]))) + + # scale the latter period by any amount if required + if(date_periods[[i]][[3]] != 1) { + + temp[, lapply(.SD, `*`, date_periods[[i]][[3]]), + keyby = by] + + } else { + + temp + } + +} + +# Aggregate By Period = agp, for data.tables ala DT +agp_DT <- function(DT, j, by, index) { + + + # to be replaced by mergelist when it is released in data.table + reduce( + lapply(seq_along(date_periods), agg_by_period, DT, j, by, index), + merge.data.table, allow.cartesian = TRUE, + by = by, all.x = TRUE, + .init = unique(DT[, ..by])) + +} + +# Create fixed tables and functions to ease date period creation ---------- + +# Only to ease recalculation of min and max value over and over again +max_date = max(DT[, dates]) +min_date = min(DT[, dates]) + +# Creating an alias to avoid using as.Date +d <- function(x) {as.Date(x)} + +# This list can be configured to calcaulte different aggregate +# time periods of interest +date_periods <- list( + # name = start_date, end_date, scaling + r3m = list(max_date - 12 * 7, max_date, 1), + p3m = list(max_date - 25 * 7, max_date - 13 * 7, 1), + r6m = list(max_date - 25 * 7, max_date, 1) , + p6m = list(max_date - 51 * 7, max_date - 26 * 7, 1), + ytd = list(d("2020-01-01"), max_date, 0.7) +) + +# Benchmark --------------------------------------------------------------- + +system.time(agp_DT(DT, "sales", "grp_1", "dates")) +system.time(agp_DT(DT, "sales", c("grp_1", "grp_2"), "dates"))• + diff --git a/decile.R b/decile.R new file mode 100644 index 0000000..ed1a057 --- /dev/null +++ b/decile.R @@ -0,0 +1,27 @@ + +# Required libraries ------------------------------------------------------ + +library(data.table) + +# Add deciles ------------------------------------------------------------- + +decile_dt <- function(x, decreasing = TRUE) { + + # browser() + + NA_positions = (is.na(x) | (x == 0)) + + x = as.data.table(x = x) + + x[is.na(x), x := 0][ + , num := .I][ + order(-x)][ + , cumsums := cumsum(x)][ + , tot := cumsums/max(cumsums)][ + , dec := (if(decreasing) 11 - ceiling(tot * 10) + else ceiling(tot * 10))][ + order(num)][ + , dec := fifelse(NA_positions, NA_real_, dec)][ + , dec] + +} \ No newline at end of file