10 Nonparametric Real Spend
The previous chapter, Chapter 9 , gave us the estimated value of the real spend CDF at 101 points. For the parametric fitting step, we need the probability mass function (PMF) value of the real spend age distribution at each of Monero’s 100 million spendable RingCT outputs. In this chapter we will also produce summary statistics of the estimated real spend age distribution.
10.1 Reverse the transformation of output age
In the previous chapter, Chapter 9 , we obtained an estimate of the real spend age distribution after it was transformed by the CDF of the decoy distribution in Chapter 8 . To get back to the un-transformed distribution, we need to compute , the inverse CDF (i.e. the quantile function) of the decoy distribution. Let be the estimated CDF of the transformed real spend distribution, evaluated at support points . The value of the estimated un-transformed real spend distribution at the th support point is defined in Equation 10.1 :
10.2 Interpolation of the CDF
We have evaluated the CDF at only 101 points. We want a smooth interpolation through all points, i.e. every transaction output. Hippel, Hunter, and Drown (2017) suggest fitting a cubic spline (piecewise polynomial) with monotonicity restrictions. We use the splinebins() function in the binsmooth package to accomplish the interpolation. Later, the PMF can be computed by taking the first difference of the CDF.
10.3 Computing summary statistics
Having produced the estimated real spend CDF , we can compute summary statistics. Quantiles, including the median, are computed easily by evaluating the inverse CDF at the desired quantile. Other statistics, such as mean, standard deviation, skewness, and kurtosis, can be computed by evaluating a Riemann–Stieltjes integral with the appropriate integrand.
10.4 Code
This code can be run in a new R session.
library(decoyanalysis)
library(data.table)
cdf.dir <- "weekly-weighted-cdf"
results.dir <- "results"
results.dir.run <- paste0(results.dir, "/results-01/")
exclusion.weeks <- NULL
nonparametric.real.spend <- list()
for (label in c("all", "rucknium", "rucknium_isthmus")) {
results.dir.run.label <- paste0(results.dir.run, "bjr/", label, "/")
week.set <- list.files(results.dir.run.label)
week.set <- week.set[grepl(".qs", week.set)]
weekly.real.spend.cdfs <- list()
summary.stats <- data.table(week = week.set, mean = 0, median = 0, sd = 0, skewness = 0, kurtosis = 0)
mixing.proportions <- data.table(week = week.set, component_1 = 0, component_2 = 0, component_3 = 0, component_4 = 0)
for (week.to.analyze in week.set) {
bjr.results <- qs::qread(paste0(results.dir.run.label, week.to.analyze))
cat(week.to.analyze, round(100 * rev(sort(bjr.results$mixing.proportions)), 3), "\n")
# print(round(100 * rev(sort(bjr.results$mixing.proportions)), 3))
mixing.proportions[week == week.to.analyze, component_1 := rev(sort(bjr.results$mixing.proportions))[1]]
mixing.proportions[week == week.to.analyze, component_2 := rev(sort(bjr.results$mixing.proportions))[2]]
mixing.proportions[week == week.to.analyze, component_3 := rev(sort(bjr.results$mixing.proportions))[3]]
mixing.proportions[week == week.to.analyze, component_4 := rev(sort(bjr.results$mixing.proportions))[4]]
}
support.max <- 0
all.weeks.weighted.v.mean <- c()
for (week.to.analyze in setdiff(week.set, exclusion.weeks)) {
if (week.to.analyze %in% summary.stats[mean != 0, week]) { next }
cat(week.to.analyze, "\n")
weekly.weighted.cdf <- qs::qread(paste0(cdf.dir, week.to.analyze))
weighted.v.mean <- weekly.weighted.cdf[[1]]$weighted.v.mean
all.weeks.weighted.v.mean[week.to.analyze] <- weighted.v.mean
weekly.z.max <- weekly.weighted.cdf[[1]]$weekly.z.max
support.max <- max(c(support.max, weekly.z.max))
weekly.weighted.cdf <- weekly.weighted.cdf[[1]]$weekly.weighted.cdf
intermediate.stepfun <- stepfun(seq_along(weekly.weighted.cdf), c(0, weekly.weighted.cdf))
weekly.weighted.inv.cdf <- Vectorize(function(x) { gbutils::cdf2quantile(x, cdf = intermediate.stepfun) })
bjr.results <- qs::qread(paste0(results.dir.run.label, week.to.analyze))
wallet2.dist.index <- which.max(bjr.results$mixing.proportions)
Fn.hat.value <- bjr.results$cdf$CDF[, wallet2.dist.index]
supp.points <- bjr.results$cdf$cdf.points
decoy <- punif
Fb <- decoy
M = 16
alpha <- 1/M
patra.sen.bjr.results <- patra.sen.bjr(Fn.hat.value, supp.points, Fb, alpha)
Fs.hat <- patra.sen.bjr.results$Fs.hat
supp.points.transformed <- vector("numeric", length(supp.points))
for (i in seq_along(supp.points)) {
if (supp.points[i] < 0.000001) {
supp.points.transformed[i] <- 0
next
}
if (supp.points[i] > max(weekly.weighted.cdf)) {
# Sometimes max(weekly.weighted.cdf) is not equal to 1
supp.points.transformed[i] <- length(weekly.weighted.cdf)
next
}
supp.points.transformed[i] <- weekly.weighted.inv.cdf(supp.points[i])
}
regularized.real.spend.cdf <- aggregate(Fs.hat, by = list(Fs.hat.transformed = supp.points.transformed), FUN = max)
Fs.hat.transformed.reg <- regularized.real.spend.cdf$Fs.hat.transformed
supp.points.reg <- regularized.real.spend.cdf$x
# Regularized eliminates duplicate support points
weekly.real.spend.cdfs[[week.to.analyze]] <- binsmooth::splinebins(
bEdges = c(Fs.hat.transformed.reg, max(Fs.hat.transformed.reg) + 1),
bCounts = c(diff(c(0, supp.points.reg)), 0))$splineCDF
real.spend.ecdf <- stepfun(c(Fs.hat.transformed.reg), c(0, supp.points.reg))
days.unit <- weighted.v.mean/(60^2*24)
est.mean <- spatstat.univar::stieltjes(function(x){x * days.unit}, real.spend.ecdf)[[1]]
est.var <- spatstat.univar::stieltjes(function(x){(x * days.unit - est.mean)^2}, real.spend.ecdf)[[1]]
if ( ! week.to.analyze %in% summary.stats$week) {
summary.stats <- rbind(summary.stats, data.table(week = week.to.analyze), fill = TRUE)
}
summary.stats[week == week.to.analyze, mean := est.mean]
summary.stats[week == week.to.analyze, median := Fs.hat.transformed.reg[findInterval(0.5, supp.points.reg)] * days.unit ]
summary.stats[week == week.to.analyze, percentile.25 := Fs.hat.transformed.reg[findInterval(0.25, supp.points.reg)] * days.unit ]
summary.stats[week == week.to.analyze, sd := sqrt(est.var)]
summary.stats[week == week.to.analyze, skewness := spatstat.univar::stieltjes(function(x){(x * days.unit - est.mean)^3}, real.spend.ecdf)[[1]] / est.var^(3/2)]
summary.stats[week == week.to.analyze, kurtosis := spatstat.univar::stieltjes(function(x){(x * days.unit - est.mean)^4}, real.spend.ecdf)[[1]] / est.var^(2)]
}
setorder(summary.stats, week)
print(summary.stats)
nonparametric.real.spend[[label]] <- list(weekly.real.spend.cdfs = weekly.real.spend.cdfs, all.weeks.weighted.v.mean = all.weeks.weighted.v.mean, support.max = support.max,
mixing.proportions = mixing.proportions, summary.stats = summary.stats)
}
qs::qsave(nonparametric.real.spend, file = paste0(results.dir.run, "nonparametric-real-spend.qs"))