#!/usr/bin/env Rscript # # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Simple tool that wraps the analysis/R library. # # To run this you need: # - ggplot # - optparse # - glmnet -- dependency of analysis library library(optparse) # For unit tests is_main <- (length(sys.frames()) == 0) # Do command line parsing first to catch errors. Loading libraries in R is # slow. if (is_main) { option_list <- list( make_option(c("-t", "--title"), help="Plot Title") ) parsed <- parse_args(OptionParser(option_list = option_list), positional_arguments = 3) # input and output } library(ggplot2) # Use CairoPNG if available. Useful for headless R. if (library(Cairo, quietly = TRUE, logical.return = TRUE)) { png_func = CairoPNG cat('Using CairoPNG\n') } else { png_func = png cat('Using png\n') } source("analysis/R/read_input.R") source("analysis/R/decode.R") source("analysis/R/util.R") source("analysis/R/alternative.R") # temporary LoadContext <- function(prefix_case) { # Creates the context, filling it with privacy parameters # Arg: # prefix_case: path prefix to the test case, e.g. '_tmp/exp' p <- paste0(prefix_case, '_params.csv') params <- ReadParameterFile(p) ctx <- new.env() ctx$params <- params # so we can write it out later ctx } RunRappor <- function(prefix_case, prefix_instance, ctx) { # Reads counts, map files, runs RAPPOR analysis engine. # Args: # prefix_case: path prefix to the test case, e.g., '_tmp/exp' # prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1' # ctx: context file with params field filled in c <- paste0(prefix_instance, '_counts.csv') counts <- ReadCountsFile(c, ctx$params) m <- paste0(prefix_case, '_map.csv') # Switch to LoadMapFile if want to cache the result map <- ReadMapFile(m, ctx$params) # Main decode.R API timing <- system.time({ res <- Decode(counts, map$map, ctx$params) }) # The line is searched for, and elapsed time is extracted, by make_summary.py. # Should the formating or wording change, make_summary must be updated too. Log("Inference took %.3f seconds", timing[["elapsed"]]) if (is.null(res)) { stop("RAPPOR analysis failed.") } Log("Decoded results:") str(res$fit) res$fit } LoadActual <- function(prefix_instance) { hist_path <- paste0(prefix_instance, '_hist.csv') # case.csv # gen_counts.R (fast_counts mode) outputs this, since we never have true # client values. if (file.exists(hist_path)) { return(read.csv(hist_path)) } # Load ground truth into context input_path <- paste0(prefix_instance, '_true_values.csv') # case.csv client_values <- read.csv(input_path) # Create a histogram, or R "table". Column 2 is the true value. t <- table(client_values$value) d <- as.data.frame(t) # convert it to a data frame with 'string' and 'count' columns colnames(d) <- c('string', 'count') d # return this data frame } CompareRapporVsActual <- function(ctx) { # Prepare input data to be plotted actual <- ctx$actual # from the ground truth file rappor <- ctx$rappor # from output of AnalyzeRAPPOR # "s12" -> 12, for graphing StringToInt <- function(x) as.integer(substring(x, 2)) actual_values <- StringToInt(actual$string) rappor_values <- StringToInt(rappor$string) # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it # occurs too rarely) actual_only <- setdiff(actual_values, rappor_values) # False positives: AnalyzeRAPPOR attributed a proportion to a string in the # map that wasn't in the true input. rappor_only <- setdiff(rappor_values, actual_values) total <- sum(actual$count) a <- data.frame(index = actual_values, # Calculate the true proportion proportion = actual$count / total, dist = "actual") r <- data.frame(index = rappor_values, proportion = rappor$proportion, dist = rep("rappor", length(rappor_values))) # Extend a and r with the values that they are missing. if (length(rappor_only) > 0) { z <- data.frame(index = rappor_only, proportion = 0.0, dist = "actual") a <- rbind(a, z) } if (length(actual_only) > 0) { z <- data.frame(index = actual_only, proportion = 0.0, dist = "rappor") r <- rbind(r, z) } # IMPORTANT: Now a and r have the same rows, but in the wrong order. Sort by index. a <- a[order(a$index), ] r <- r[order(r$index), ] # L1 distance between actual and rappor distributions l1 <- sum(abs(a$proportion - r$proportion)) # The max L1 distance between two distributions is 2; the max total variation # distance is 1. total_variation <- l1 / 2 # Choose false positive strings and their proportion from rappor estimates false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')] false_neg <- a[a$index %in% actual_only, c('index', 'proportion')] Log("False positives:") str(false_pos) Log("False negatives:") str(false_neg) # NOTE: We should call Decode() directly, and then num_rappor is # metrics$num_detected, and sum_proportion is metrics$allocated_mass. metrics <- list( num_actual = nrow(actual), # data frames num_rappor = nrow(rappor), num_false_pos = nrow(false_pos), num_false_neg = nrow(false_neg), total_variation = total_variation, sum_proportion = sum(rappor$proportion) ) Log("Metrics:") str(metrics) # Return plot data and metrics list(plot_data = rbind(r, a), metrics = metrics) } # Colors selected to be friendly to the color blind: # http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/ palette <- c("#E69F00", "#56B4E9") PlotAll <- function(d, title) { # NOTE: geom_bar makes a histogram by default; need stat = "identity" g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist))) b <- geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8)) t <- ggtitle(title) g + b + t + scale_fill_manual(values=palette) } WritePlot <- function(p, outdir, width = 800, height = 600) { filename <- file.path(outdir, 'dist.png') png_func(filename, width=width, height=height) plot(p) dev.off() Log('Wrote %s', filename) } WriteSummary <- function(metrics, outdir) { filename <- file.path(outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) Log('Wrote %s', filename) } main <- function(parsed) { args <- parsed$args options <- parsed$options input_case_prefix <- args[[1]] input_instance_prefix <- args[[2]] output_dir <- args[[3]] # increase ggplot font size globally theme_set(theme_grey(base_size = 16)) # NOTE: It takes more than 2000+ ms to get here, while the analysis only # takes 500 ms or so (as measured by system.time). ctx <- LoadContext(input_case_prefix) ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx) ctx$actual <- LoadActual(input_instance_prefix) d <- CompareRapporVsActual(ctx) p <- PlotAll(d$plot_data, options$title) WriteSummary(d$metrics, output_dir) WritePlot(p, output_dir) } if (is_main) { main(parsed) }