# Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # RAPPOR simulation library. Contains code for encoding simulated data and # creating the map used to encode and decode reports. library(glmnet) library(parallel) # mclapply SetOfStrings <- function(num_strings = 100) { # Generates a set of strings for simulation purposes. strs <- paste0("V_", as.character(1:num_strings)) strs } GetSampleProbs <- function(params) { # Generate different underlying distributions for simulations purposes. # Args: # - params: a list describing the shape of the true distribution: # c(num_strings, prop_nonzero_strings, decay_type, # rate_exponetial). nstrs <- params[[1]] nonzero <- params[[2]] decay <- params[[3]] expo <- params[[4]] background <- params[[5]] probs <- rep(0, nstrs) ind <- floor(nstrs * nonzero) if (decay == "Linear") { probs[1:ind] <- (ind:1) / sum(1:ind) } else if (decay == "Constant") { probs[1:ind] <- 1 / ind } else if (decay == "Exponential") { temp <- seq(0, nonzero, length.out = ind) temp <- exp(-temp * expo) temp <- temp + background temp <- temp / sum(temp) probs[1:ind] <- temp } else { stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")') } probs } EncodeAll <- function(x, cohorts, map, params, num_cores = 1) { # Encodes the ground truth into RAPPOR reports. # # Args: # x: Observed strings for each report, Nx1 vector # cohort: Cohort assignment for each report, Nx1 vector # map: list of matrices encoding locations of hashes for each # string, for each cohort # params: System parameters # # Returns: # RAPPOR reports for each piece of data. p <- params$p q <- params$q f <- params$f k <- params$k qstar <- (1 - f / 2) * q + (f / 2) * p pstar <- (1 - f / 2) * p + (f / 2) * q candidates <- colnames(map[[1]]) if (!all(x %in% candidates)) { stop("Some strings are not in the map. set(X) - set(candidates): ", paste(setdiff(unique(x), candidates), collapse=" "), "\n") } bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE, USE.NAMES = FALSE) reports <- mclapply(bfs, function(x) { noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar)) ind <- which(x) noise[ind] <- sample(0:1, length(ind), replace = TRUE, prob = c(1 - qstar, qstar)) noise }, mc.cores = num_cores) reports } CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) { # Creates a list of 0/1 matrices corresponding to mapping between the strs and # Bloom filters for each instance of the RAPPOR. # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4, # the result could look this: # [[1]] # 1 0 0 0 # 0 1 0 0 # 0 0 0 1 # [[2]] # 0 1 0 0 # 0 0 0 1 # 0 0 1 0 # # Args: # strs: a vector of strings # params: a list of parameters in the following format: # (k, h, m, p, q, f). # generate_pos: Tells whether to generate an object storing the # positions of the nonzeros in the matrix # basic: Tells whether to use basic RAPPOR (only works if h=1). M <- length(strs) map_by_cohort <- list() k <- params$k h <- params$h m <- params$m for (i in 1:m) { if (basic && (h == 1) && (k == M)) { ones <- 1:M } else { ones <- sample(1:k, M * h, replace = TRUE) } cols <- rep(1:M, each = h) map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M)) colnames(map_by_cohort[[i]]) <- strs } all_cohorts_map <- do.call("rBind", map_by_cohort) if (generate_pos) { map_pos <- t(apply(all_cohorts_map, 2, function(x) { ind <- which(x == 1) n <- length(ind) if (n < h * m) { ind <- c(ind, rep(NA, h * m - n)) } ind })) } else { map_pos <- NULL } list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map, map_pos = map_pos) } GetSample <- function(N, strs, probs) { # Sample for the strs population with distribution probs. sample(strs, N, replace = TRUE, prob = probs) } GetTrueBits <- function(samp, map, params) { # Convert sample generated by GetSample() to Bloom filters where mapping # is defined in map. # Output: # - reports: a matrix of size [num_instances x size] where each row # represents the number of times each bit in the Bloom filter # was set for a particular instance. # Note: reports[, 1] contains the same size for each instance. N <- length(samp) k <- params$k m <- params$m strs <- colnames(map[[1]]) reports <- matrix(0, m, k + 1) inst <- sample(1:m, N, replace = TRUE) for (i in 1:m) { tab <- table(samp[inst == i]) tab2 <- rep(0, length(strs)) tab2[match(names(tab), strs)] <- tab counts <- apply(map[[i]], 1, function(x) x * tab2) # cat(length(tab2), dim(map[[i]]), dim(counts), "\n") reports[i, ] <- c(sum(tab2), apply(counts, 2, sum)) } reports } GetNoisyBits <- function(truth, params) { # Applies RAPPOR to the Bloom filters. # Args: # - truth: a matrix generated by GetTrueBits(). k <- params$k p <- params$p q <- params$q f <- params$f rappors <- apply(truth, 1, function(x) { # The following samples considering 4 cases: # 1. Signal and we lie on the bit. # 2. Signal and we tell the truth. # 3. Noise and we lie. # 4. Noise and we tell the truth. # Lies when signal sampled from the binomial distribution. lied_signal <- rbinom(k, x[-1], f) # Remaining must be the non-lying bits when signal. Sampled with q. truth_signal <- x[-1] - lied_signal # Lies when there is no signal which happens x[1] - x[-1] times. lied_nosignal <- rbinom(k, x[1] - x[-1], f) # Trtuh when there's no signal. These are sampled with p. truth_nosignal <- x[1] - x[-1] - lied_nosignal # Total lies and sampling lies with 50/50 for either p or q. lied <- lied_signal + lied_nosignal lied_p <- rbinom(k, lied, .5) lied_q <- lied - lied_p # Generating the report where sampling of either p or q occurs. rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p) }) cbind(truth[, 1], t(rappors)) } GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05, prop_missing = 0, correction = "Bonferroni") { # Simulate N reports with pop_params describing the population and # params describing the RAPPOR configuration. num_strings = pop_params[[1]] strs <- SetOfStrings(num_strings) probs <- GetSampleProbs(pop_params) samp <- GetSample(N, strs, probs) map <- CreateMap(strs, params) truth <- GetTrueBits(samp, map$map_by_cohort, params) rappors <- GetNoisyBits(truth, params) strs_apprx <- strs map_apprx <- map$all_cohorts_map # Remove % of strings to simulate missing variables. if (prop_missing > 0) { ind <- which(probs > 0) removed <- sample(ind, ceiling(prop_missing * length(ind))) map_apprx <- map$all_cohorts_map[, -removed] strs_apprx <- strs[-removed] } # Randomize the columns. ind <- sample(1:length(strs_apprx), length(strs_apprx)) map_apprx <- map_apprx[, ind] strs_apprx <- strs_apprx[ind] fit <- Decode(rappors, map_apprx, params, alpha = alpha, correction = correction) # Add truth column. fit$fit$Truth <- table(samp)[fit$fit$string] fit$fit$Truth[is.na(fit$fit$Truth)] <- 0 fit$map <- map$map_by_cohort fit$truth <- truth fit$strs <- strs fit$probs <- probs fit }