# Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Authors: vpihur@google.com (Vasyl Pihur), fanti@google.com (Giulia Fanti) library(RUnit) source("analysis/R/encode.R") source("analysis/R/decode.R") source("analysis/R/simulation.R") source("analysis/R/association.R") source("analysis/R/fast_em.R") source("analysis/R/util.R") SamplePopulations <- function(N, num_variables = 1, params, variable_opts) { # Samples a number of variables. User specifies the number of variables # and some desired properties of those variables. # # Args: # N: Number of reports to generate. # params: RAPPOR parameters, like Bloom filter size, number of # hash bits, etc. # variable_opts: List of options for generating the ground truth: # independent = whether distinct variables should be independently drawn # deterministic = whether the variables should be drawn from a # Poisson distribution or uniformly assigned across the range # of 1:num_strings # num_strings: Only does something if deterministic == TRUE, and # specifies how many strings to use in the uniform assignment # of ground truth strings. # # Returns: # RAPPOR simulated ground truth for each piece of data. m <- params$m num_strings <- variable_opts$num_strings if (variable_opts$deterministic) { # If a deterministic assignment is desired, evenly distribute # strings across all cohorts. reps <- ceiling(N / num_strings) variables <- lapply(1:num_variables, function(i) as.vector(sapply(1:num_strings, function(x) rep(x, reps)))[1:N]) cohorts <- lapply(1:num_variables, function(i) rep(1:m, ceiling(N / m))[1:N]) } else { # Otherwise, draw from a Poisson random variable variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1) # Randomly assign cohorts in each dimension cohorts <- lapply(1:num_variables, function(i) sample(1:params$m, N, replace = TRUE)) if (!variable_opts$independent) { # If user wants dependent RVs, subsequent variables are closely correlated # with the first variable in the foll. manner: # variable_i ~ variable_1 + (i-1) Bernoulli(0.5) bernoulli_corr <- function(x) { variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)} variables[2:num_variables] <- lapply(2:num_variables, function(x) bernoulli_corr(x)) } } list(variables = variables, cohorts = cohorts) } Simulate <- function(N, num_variables, params, variable_opts = NULL, truth = NULL, basic = FALSE) { if (is.null(truth)) { truth <- SamplePopulations(N, num_variables, params, variable_opts) } strs <- lapply(truth$variables, function(x) sort(seq(max(x)))) # strs <- lapply(truth$variables, function(x) sort(unique(x))) # strs <- lapply(truth$variables, function(x) 1:length(unique(x))) # Construct lists of maps and reports if (variable_opts$deterministic) { # Build the maps map <- CreateMap(strs[[1]], params, FALSE, basic = basic) maps <- lapply(1:num_variables, function(x) map) # Build the reports report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]], map$map_by_cohort, params) reports <- lapply(1:num_variables, function(x) report) } else { # Build the maps maps <- lapply(1:num_variables, function(x) CreateMap(strs[[x]], params, FALSE, basic = basic)) # Build the reports reports <- lapply(1:num_variables, function(x) EncodeAll(truth$variables[[x]], truth$cohorts[[x]], maps[[x]]$map_by_cohort, params)) } list(reports = reports, cohorts = truth$cohorts, truth = truth$variables, maps = maps, strs = strs) } # ----------------Actual testing starts here--------------- # TestComputeDistributionEM <- function() { # Test various aspects of ComputeDistributionEM in association.R. # Tests include: # Test 1: Compute a joint distribution of uniformly distributed, # perfectly correlated strings # Test 2: Compute a marginal distribution of uniformly distributed strings # Test 3: Check the "other" category estimation works by removing # a string from the known map. # Test 4: Test that the variance from EM algorithm is 1/N when there # is no noise in the system. # Test 5: Check that the right answer is still obtained when f = 0.2. num_variables <- 3 N <- 100 # Initialize the parameters params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0) variable_opts <- list(deterministic = TRUE, num_strings = 2, independent = FALSE) sim <- Simulate(N, num_variables, params, variable_opts) # Test 1: Delta function pmf joint_dist <- ComputeDistributionEM(sim$reports, sim$cohorts, sim$maps, ignore_other = TRUE, params = params, marginals = NULL, estimate_var = FALSE) # The recovered distribution should be close to the delta function. checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01) checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01) # Test 2: Now compute a marginal using EM dist <- ComputeDistributionEM(list(sim$reports[[1]]), list(sim$cohorts[[1]]), list(sim$maps[[1]]), ignore_other = TRUE, params = params, marginals = NULL, estimate_var = FALSE) checkTrue(abs(dist$fit["1"] - 0.5) < 0.01) # Test 3: Check that the "other" category is correctly computed # Build a modified map with no column 2 (i.e. we only know that string # "1" is a valid string map <- sim$maps[[1]] small_map <- map for (i in 1:params$m) { locs <- which(map$map_by_cohort[[i]][, 1]) small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)), dims = c(params$k, 1)) locs <- which(map$all_cohorts_map[, 1]) colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1] } small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort) dist <- ComputeDistributionEM(list(sim$reports[[1]]), list(sim$cohorts[[1]]), list(small_map), ignore_other = FALSE, params = params, marginals = NULL, estimate_var = FALSE) # The recovered distribution should be uniform over 2 strings. checkTrue(abs(dist$fit[1] - 0.5) < 0.1) # Test 4: Test the variance is 1/N variable_opts <- list(deterministic = TRUE, num_strings = 1) sim <- Simulate(N, num_variables = 1, params, variable_opts) dist <- ComputeDistributionEM(sim$reports, sim$cohorts, sim$maps, ignore_other = TRUE, params = params, marginals = NULL, estimate_var = TRUE) checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N) # Test 5: Check that when f=0.2, we still get a good estimate params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2) variable_opts <- list(deterministic = TRUE, num_strings = 2) sim <- Simulate(N, num_variables = 2, params, variable_opts) dist <- ComputeDistributionEM(sim$reports, sim$cohorts, sim$maps, ignore_other = TRUE, params = params, marginals = NULL, estimate_var = FALSE) checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15) checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15) # Test 6: Check the computed joint distribution with randomized # correlated inputs from the Poisson distribution # Expect to have correlation between strings n and n + 1 N <- 1000 params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1) variable_opts <- list(deterministic = FALSE, independent = FALSE) sim <- Simulate(N, num_variables = 2, params, variable_opts) dist <- ComputeDistributionEM(sim$reports, sim$cohorts, sim$maps, ignore_other = TRUE, params = params, marginals = NULL, estimate_var = FALSE) print_dist <- TRUE # to print joint distribution, set to TRUE if (print_dist) { # dist$fit[dist$fit<1e-4] <- 0 # Sort by row names and column names to visually see correlation print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))]) } # Check for correlations (constants chosen heuristically to get good # test confidence with small # of samples) # Should have mass roughly 1/2e and 1/2e each checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1) checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1) # Should have mass roughly 1/4e and 1/4e each checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06) # Check for lack of probability mass checkTrue(dist$fit["1", "3"] < 0.02) checkTrue(dist$fit["1", "4"] < 0.02) checkTrue(dist$fit["2", "1"] < 0.02) checkTrue(dist$fit["2", "4"] < 0.02) checkTrue(dist$fit["3", "1"] < 0.02) checkTrue(dist$fit["3", "2"] < 0.02) } MakeCondProb <- function() { d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2) d = d / sum(d) e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2) e = e / sum(e) list(d, e, d) # 3 reports } # Test the slow version in R. RunEmFunction <- function(cond_prob, max_em_iters) { cond_prob <- MakeCondProb() # Mechanical test of 4 iterations. em$hist has 5 elements. result <- EM(cond_prob, max_em_iters=max_em_iters) result$est } # Run a test of the EM executable RunEmExecutable <- function(em_executable, cond_prob, max_em_iters) { print(cond_prob) if (!file.exists(em_executable)) { stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable)) } em_iter_func <- ConstructFastEM(em_executable, "/tmp") result <- em_iter_func(cond_prob, max_em_iters=max_em_iters) result$est } TestCppImplementation <- function() { cond_prob <- MakeCondProb() max_em_iters <- 10 fit1 <- RunEmFunction(cond_prob, max_em_iters) # Assume we're in the repo root em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em") fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters) cpp_diff <- abs(fit1 - fit2) print(cpp_diff) Log("C++ implementation difference after %d iterations: %e", max_em_iters, sum(cpp_diff)) # After 10 iterations they should be almost indistinguishable. checkTrue(sum(cpp_diff) < 1e-10) } TestTensorFlowImplementation <- function() { cond_prob <- MakeCondProb() max_em_iters <- 10 fit1 <- RunEmFunction(cond_prob, max_em_iters) em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh") fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters) tf_diff <- abs(fit1 - fit2) print(tf_diff) Log("TensorFlow implementation difference after %d iterations: %e", max_em_iters, sum(tf_diff)) checkTrue(sum(tf_diff) < 1e-10) }