#' Reads a dataset from Dataiku's Data Science Studio
#'
#' @param name name of dataset
#' @param columns a character vector of columns to read from dataset
#' @param partitions character vector of partitions to load
#' @param samplingMethod the sampling method to use, if necessary
#' @param nbRows An integer. The number of rows to sample (used by the "fixed" and "head" sampling methods)
#' @param ratio A numeric. The probability used for sampling each row. 0 < ratio < 1.
#' @param convertEmptyStrings Whether to convert empty strings to NAs
#' @param colClasses Manually-specified column classes. If not specified, column classes are inferred from the data or from the dataset schema, depending on inferColClassesFromData.
#' @param inferColClassesFromData If colClasses is not specified, infer column classes from data instead of dataset schema.
#' @param na.strings Optional list of strings to convert to NAs. Default is "NA".
#'
#' @return A data.frame with the requested data
#' 
#' @import dplyr
#' @importFrom RJSONIO toJSON
#' @importFrom RJSONIO fromJSON
#' 
#' @examples
#' \dontrun{
#' d = dkuReadDataset("iris")
#' 
#' # read in two columns
#' d = dkuReadDataset("iris", columns=c("Sepal.Length", "Sepal.Width"))
#' 
#' # explicitly set colClasses
#' d = dkuReadDataset("iris", colClasses=c("numeric", "numeric", "numeric", "numeric", "character"))
#' 
#' # fixed sampling -- read 100 random rows from the iris dataset
#' d = dkuReadDataset("iris", samplingMethod="fixed", nbRows=100)
#' 
#' # head sampling -- read the first 100 rows from the iris dataset
#' d = dkuReadDataset("iris", samplingMethod="head", nbRows=100)
#' 
#' # ratio sampling -- read 30% of the rows (chosen randomly) from the iris dataset
#' d = dkuReadDataset("iris", samplingMethod="ratio", ratio=0.3)
#' }
#' @details
#'
#' Users can specify which partitions and columns to load, as well as a sampling scheme
#' if the dataset is too large to fit into memory. Possible sampling schemes are fixed sampling, where a set number of rows are 
#' randomly chosen from the dataset; head sampling, where the first *n* rows are sampled from the dataset; and ratio sampling, where
#' rows are included randomly with a probability.
#'
#' @export

dkuReadDataset <- function(name,
                           partitions=NULL,
                           samplingMethod=c("full", "fixed", "head", "ratio"),
                           columns=NULL,
                           nbRows=NULL,
                           ratio=NULL,
                           convertEmptyStrings=TRUE,
                           colClasses=NA,
                           inferColClassesFromData=TRUE,
                           na.strings="NA") {

  name <- dku__resolve_smart_name(name)
  samplingMethod <- match.arg(samplingMethod)

  # Work out which partitions to read and which activity we are running in.
  parsedRSpec <- getDkuFlowSpec()
  if (is.null(parsedRSpec)) {
    # Notebook mode: partitions come from the caller.
    # We accept a single partition identifier (-> autoconvert to list)
    if (!is.null(partitions) && !is.list(partitions)) {
      partitions <- list(partitions)
    }
    currentActivityId <- ""
  } else {
    # Flow mode: partitions come from the flow spec, never from the caller.
    if (!is.null(partitions)) {
      stop("You cannot explicitly set partitions when running within Dataiku Flow")
    }
    for (input in parsedRSpec[["in"]]) {
      if (input[["fullName"]] == name || input[["smartName"]] == name) {
        # The name check comes first so that `[[` is never applied to a
        # missing element.
        if (!("partitions" %in% names(input)) || is.null(input[["partitions"]])) {
          partitions <- list()
        } else {
          partitions <- input[["partitions"]]
        }
      }
    }
    currentActivityId <- parsedRSpec$currentActivityId
  }

  getSchemaUrl <- dku_intercom__get_jek_or_backend_url("/datasets/get-schema/")
  readDataUrl <- dku_intercom__get_jek_or_backend_url("/datasets/read-data/")
  getVerificationPathURL <- dku_intercom__get_jek_or_backend_url("/datasets/verify-read/")

  # Fetch the dataset schema; it provides the column names and, optionally,
  # the column classes.
  resp <- POST(getSchemaUrl, body = list(fullDatasetName=name),
               encode="form", dku__get_auth_headers(), dku_intercom__get_httr_config())
  dku__check_api_error(resp, "Failed to read dataset")

  schema <- content(resp)

  if (length(schema$columns) == 0) {
    stop("Failed: empty schema")
  }

  schema_names <- sapply(schema$columns, function(x) x$name)

  # colClasses may be a vector with one entry per column. `&&` needs scalar
  # operands (length > 1 is an error in R >= 4.3), so only a single NA counts
  # as "not specified".
  infer_from_schema <- length(colClasses) == 1L && is.na(colClasses) &&
    inferColClassesFromData == FALSE
  if (infer_from_schema) {
    colClasses <- dku__convert_schema_to_colClasses(schema)
  }

  # `name` is split on "." into its project key and dataset name.
  name_parts <- strsplit(name, "\\.")[[1]]

  # Attempt to make a unique read session id: activity hash, dataset (and
  # partitions) hash, plus a random number. digest is called through its
  # namespace so the caller's search path is left untouched.
  currentActivityIdPart <- digest::digest(currentActivityId, "md5")
  if (!is.null(partitions)) {
    datasetSessionIdPart <- paste0(digest::digest(name, "md5"), "-",
                                   digest::digest(partitions, "md5"))
  } else {
    datasetSessionIdPart <- digest::digest(name, "md5")
  }
  readSessionId <- sprintf("%s-%s-%s", currentActivityIdPart, datasetSessionIdPart,
                           floor(runif(1, min=0, max=10e10)))

  requestBody <- list(projectKey=name_parts[1], datasetName=name_parts[2],
                      format="tsv-excel-header", readSessionId=readSessionId)

  if (!is.null(columns)) {
    if (length(columns) == 0) {
      stop("You must select at least one column")
    }
    missing_cols <- unique(columns[!(columns %in% schema_names)])
    if (length(missing_cols) > 0) {
      # One sentence per missing column, joined explicitly; passing a character
      # vector straight to stop() would glue the sentences together.
      stop(paste(paste("Column", missing_cols, "doesn't exist"), collapse="; "))
    }
    requestBody[["columns"]] <- paste(columns, collapse=',')
  }
  if (!is.null(partitions)) {
    requestBody[["partitions"]] <- paste(partitions, collapse=',')
  }

  # "fixed" and "head" sampling need a row count; fail with a clear message
  # rather than an obscure error from round(NULL).
  if (samplingMethod %in% c("fixed", "head") && is.null(nbRows)) {
    stop("nbRows must be specified when samplingMethod is \"", samplingMethod, "\"")
  }
  sampling <- switch(samplingMethod,
                     full=NULL,
                     fixed=fixed.sampling(round(nbRows)),
                     head=head.sampling(round(nbRows)),
                     ratio=ratio.sampling(ratio))
  if (!is.null(sampling)) {
    requestBody[["sampling"]] <- RJSONIO::toJSON(sampling)
  }

  # Build the request with httr, then copy its options and headers onto a curl
  # handle so the response can be read through a connection.
  handle <- curl::new_handle()
  location_data <- dku_intercom__get_location_info()
  if ("no_check_certificate" %in% names(location_data) && location_data$no_check_certificate) {
    curl::handle_setopt(handle, ssl_verifypeer=0)
  }
  httr_req <- httr:::request_build("POST",
                                    readDataUrl,
                                    httr:::body_config(requestBody, encode="form"),
                                    dku_intercom__get_auth_headers(),
                                    dku_intercom__get_httr_config())

  curl::handle_setopt(handle, .list = httr_req$options)
  curl::handle_setheaders(handle, .list = httr_req$headers)

  con <- curl::curl(readDataUrl, "rf", handle = handle)
  dku__check_curl_api_error(handle, con, "Failed to read dataset data")

  # The data is parsed with read.delim (format requested above: "tsv-excel-header").
  # `finally` closes the connection even when parsing fails, and still closes
  # it before the verification call below.
  df <- tryCatch({
    parsed <- read.delim(con,
                         header=TRUE,
                         stringsAsFactors=FALSE,
                         na.strings=if (convertEmptyStrings) c(na.strings, "") else na.strings,
                         colClasses=colClasses) %>%
      mutate_all(dku__convert_logical)

    if (infer_from_schema) {
      # dates cannot be converted in read.delim, so do it afterwards
      parsed <- dku__fixup_dates_for_schema(parsed, schema)
    }
    parsed
  }, finally = close(con))

  # verify that the data is not incomplete
  resp <- POST(getVerificationPathURL, body = list(readSessionId=readSessionId),
               encode="form", dku_intercom__get_auth_headers(), dku_intercom__get_httr_config())
  dku__check_api_error(resp, "Failed to verify dataset read")

  df
}

#' Lists the partitions of a dataset
#'
#' @param name name of dataset
#' @return The parsed content of the response to the list-partitions call
#' @export
dkuListDatasetPartitions <- function(name) {
  name <- dku__resolve_smart_name(name)

  listPartitionsUrl <- dku_intercom__get_jek_or_backend_url("/datasets/list-partitions/")

  resp <- POST(listPartitionsUrl, body = list(fullDatasetName=name),
               encode="form", dku__get_auth_headers(), dku_intercom__get_httr_config())
  # Check the response like the other dataset calls in this file do, so an API
  # failure is raised instead of being returned as if it were the partition list.
  dku__check_api_error(resp, "Failed to list dataset partitions")

  content(resp)
}

#' Gets information about the location of a dataset (path to files, table name, ...)
#'
#' @param name name of dataset (may be prefixed by a project key)
#' @param sensitiveInfo should the call also retrieve information about the connection that might include passwords. This
#'                         is only possible if you have access to the connection details.
#' @return The parsed content of the response to the get-location-info call
#' @export
dkuGetDatasetLocationInfo <- function(name, sensitiveInfo=FALSE) {
  # Resolve the reference, then split it into dataset id and project key.
  fullName <- dku__resolve_smart_name(name)
  id <- dku__ref_to_name(fullName)
  pkey <- dku__ref_to_pkey(fullName)

  getLocationInfoURL <- dku_intercom__get_jek_or_backend_url("/datasets/get-location-info/")

  resp <- POST(getLocationInfoURL,
               body = list(datasetName=id, projectKey=pkey, sensitiveInfo=sensitiveInfo),
               encode="form", dku__get_auth_headers(), dku_intercom__get_httr_config())
  # The message names the operation that actually failed.
  dku__check_api_error(resp, "Failed to get dataset location info")

  content(resp)
}
