useful_scripts/parse_prism_V10.R at master · Biomiha/useful_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
prism_tables <- function(path) {
  require(jsonlite)
  getExtension <- function(file){
    ex <- strsplit(basename(file), split="\\.")[[1]]
    return(ex[-1])
  }
  stopifnot(getExtension(path) == "prism")
  temp_dir <- tempdir()
  unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
  top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
  sheet_ids <- top_level_json$sheets$data
  sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
  return(as.vector(sheet_names))
}

extract_y_colnames <- function(path, table = 1) {
  require(jsonlite)
  require(vroom)
  getExtension <- function(file){
    ex <- strsplit(basename(file), split="\\.")[[1]]
    return(ex[-1])
  }
  stopifnot(getExtension(path) == "prism")
  temp_dir <- tempdir()
  unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
  top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
  sheet_ids <- top_level_json$sheets$data
  sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
  if(is.numeric(table)) table <- unname(sheet_names[table])
  stopifnot(table %in% sheet_names)
  table_sheet_id <- names(top_level_json$sheetAttributesMap[sheet_ids])[sheet_names %in% table]
  sheet_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
    grep(pattern = "data", value = TRUE) |>
    grep(pattern = "sheets", value = TRUE) |>
    grep(pattern = table_sheet_id, value = TRUE) |>
    grep(pattern = "sheet.json", value = TRUE) |>
    fromJSON()

  stopifnot(identical(sheet_json$title, table))
  yDataSets <- sheet_json$table$dataSets
  search_y <- function(x) {
    list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
      grep(pattern = "data", value = TRUE) |>
      grep(pattern = "sets", value = TRUE) |>
      grep(pattern = x, value = TRUE)
  }
  extract_title <- function(set_json) {
    if("string" %in% names(set_json$title)) {
      set_json$title$string
    } else set_json$title
  }

  y_colnames <- lapply(yDataSets, search_y) |>
    lapply(FUN = fromJSON) |>
    lapply(FUN = extract_title) |>
    unlist()

  y_has_replicates <- sheet_json$table$dataFormat == "y_replicates"

  y_replicates <- sheet_json$table$replicatesCount

  if(y_has_replicates) {
    y_colnames <- make.unique(rep(y_colnames, each = y_replicates))
  }
  return(y_colnames)
}

read_prism <- function(path, table = 1, use_colnames = FALSE, ...) {
  require(jsonlite)
  require(vroom)
  getExtension <- function(file){
    ex <- strsplit(basename(file), split="\\.")[[1]]
    return(ex[-1])
  }
  stopifnot(getExtension(path) == "prism")
  temp_dir <- tempdir()
  unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
  top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
  sheet_ids <- top_level_json$sheets$data
  sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
  if(is.numeric(table)) table <- unname(sheet_names[table])
  stopifnot(table %in% sheet_names)
  table_sheet_id <- names(top_level_json$sheetAttributesMap[sheet_ids])[sheet_names %in% table]
  sheet_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
    grep(pattern = "data", value = TRUE) |>
    grep(pattern = "sheets", value = TRUE) |>
    grep(pattern = table_sheet_id, value = TRUE) |>
    grep(pattern = "sheet.json", value = TRUE) |>
    fromJSON()

  stopifnot(identical(sheet_json$title, table))
  table_id <- sheet_json$table$uid

  if(!is.null(sheet_json$table$xDataSet)) {
    xDataSet <- sheet_json$table$xDataSet
    x_colname <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
      grep(pattern = "data", value = TRUE) |>
      grep(pattern = "sets", value = TRUE) |>
      grep(pattern = xDataSet, value = TRUE) |>
      fromJSON() |>
      _$title
  } else x_colname <- NULL

  yDataSets <- sheet_json$table$dataSets
  search_y <- function(x) {
    list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
      grep(pattern = "data", value = TRUE) |>
      grep(pattern = "sets", value = TRUE) |>
      grep(pattern = x, value = TRUE)
  }
  extract_title <- function(set_json) {
    if("string" %in% names(set_json$title)) {
      set_json$title$string
    } else set_json$title
  }

  y_colnames <- lapply(yDataSets, search_y) |>
    lapply(FUN = fromJSON) |>
    lapply(FUN = extract_title) |>
    unlist()

  y_has_replicates <- sheet_json$table$dataFormat == "y_replicates"

  y_replicates <- sheet_json$table$replicatesCount

  if(y_has_replicates) {
    y_colnames <- make.unique(rep(y_colnames, each = y_replicates))
    }

  table_data <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
    grep(pattern = "data", value = TRUE) |>
    grep(pattern = "tables", value = TRUE) |>
    grep(pattern = table_id, value = TRUE) |>
    grep(pattern = "data.csv", value = TRUE) |>
    vroom(col_names = FALSE, ...)

  dims_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
    grep(pattern = "data", value = TRUE) |>
    grep(pattern = "tables", value = TRUE) |>
    grep(pattern = table_id, value = TRUE) |>
    grep(pattern = "content.json", value = TRUE) |>
    fromJSON()

  dims_json <- c(dims_json$numberOfRows, dims_json$numberOfColumns)
  stopifnot(identical(dim(table_data), dims_json))


  if(use_colnames) {
    new_colnames <- c(x_colname, y_colnames)
    colnames(table_data) <- c(setdiff(colnames(table_data), tail(colnames(table_data), n = length(new_colnames))), new_colnames)
  }
  return(table_data)
}