Added functions to merge data in the main package

'merge_results()' merges biblioverlap's results into a single dataframe and 'merge_input_files()' merges multiple files from the same source into a single file
gavieira · Dec 13, 2023 · 670edba · 670edba
1 parent 1d9830a
commit 670edba
Show file tree

Hide file tree

Showing 11 changed files with 250 additions and 45 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,8 @@
 export("%>%")
 export(biblioverApp)
 export(biblioverlap)
+export(merge_input_files)
+export(merge_results)
 export(plot_matching_summary)
 export(plot_upset)
 export(plot_venn)

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@
 * Added 'quote' tabpanels alongside all `fileInput()` calls to determine quotation of input datasets
 * Added download buttons for plots in ShinyApp
 * Added package logo to plots
+* Added `merge_results`: a function to merge biblioverlap's results into a single dataframe
+* Added `merge_input_files`: a function to merge multiple files from the same source into a single file
 
 
 # biblioverlap 1.0.3

diff --git a/R/01-data_preprocessing.R b/R/01-data_preprocessing.R
@@ -45,3 +45,54 @@ data_preprocessing <- function(db_list, matching_fields) {
   } )
   return(preprocessed_data)
 }
+
+
+#' Merge multiple input files from the same source
+#'
+#' @details
+#' It is fairly common to retrieve data from a single bibliographic database in small chunks. Thus, this function is designed to merge multiple files from the same source into a single file while also removing duplicate records. All columns are returned as character: surrounding whitespace is trimmed and empty strings are converted to NA before duplicates are dropped.
+#'
+#' @param input_files - a character vector containing the paths to all input files (all files must share the same columns)
+#' @param sep - field separator. Default: comma (',')
+#' @param quote - quote type used for character fields. Default: Double quotes ('"')
+#'
+#' @return a single dataframe with all unique records from the input files. An error is raised if no file is provided or if the files cannot be row-bound
+#' @export
+#'
+#' @examples
+#'
+#' ## Generating tempfiles
+#' tempfile1 <- tempfile(fileext = ".csv")
+#' tempfile2 <- tempfile(fileext = ".csv")
+#' write.csv(ufrj_bio_0122$Biochemistry, file = tempfile1, row.names = FALSE)
+#' write.csv(ufrj_bio_0122$Genetics, file = tempfile2, row.names = FALSE)
+#'
+#' ## Testing function
+#' merged_files <- merge_input_files(c(tempfile1, tempfile2))
+#' dim(merged_files)
+#' head(merged_files)
+#'
+merge_input_files <- function(input_files,
+                              sep = ",",
+                              quote = '"') {
+  if (length(input_files) == 0) { # Guard: rbind() over an empty list yields NULL, which cannot be cleaned or subset below
+    stop('No input files were provided.')
+  }
+  df_list <- lapply(input_files, function(input_file) {
+    utils::read.csv(input_file,
+             sep = sep,
+             quote = quote,
+             strip.white = TRUE,
+             check.names = FALSE) })
+  df <- tryCatch( # rbind() fails when the files do not share the same set of columns
+    do.call(rbind, df_list),
+    error = function(err) {
+      stop('Failed to merge files. Are they from the same database and/or have the same columns?') }
+  )
+  df[] <- lapply(df, function(col) { #Cleaning data (one column at a time)
+    col <- trimws(as.character(col)) # Removing leading and trailing whitespaces
+    col[!is.na(col) & col == ""] <- NA  # Convert empty strings to NA (values that are already NA are left untouched)
+    return(col)
+  })
+  df[!duplicated(df), , drop = FALSE] # Removing duplicate records; drop = FALSE keeps a single-column result as a dataframe
+}
diff --git a/R/05-biblioverlap.R b/R/05-biblioverlap.R
@@ -232,3 +232,44 @@ biblioverlap <- function(db_list, matching_fields = default_matching_fields, n_t
   #final_db_list$score_matrices <- score_matrices #For debugging
   return (final_db_list) #Returning db_list and summary
 }
+
+
+#' Merge biblioverlap's results into a single dataframe
+#'
+#' @param db_list - list of matched dataframes (with UUID column added by [`biblioverlap`])
+#' @param filter_distinct - boolean value determining whether to return only the subset containing distinct records (TRUE) or to keep overlapping records between datasets (FALSE). Default: FALSE
+#'
+#' @return a single dataframe containing data from db_list (all columns coerced to character), featuring an additional 'SET_NAME' column to indicate from which dataset each record came
+#' @importFrom rlang .data
+#' @export
+#'
+#' @examples
+#'
+#' #Running document-level matching procedure for two datasets
+#' biblioverlap_results <- biblioverlap(ufrj_bio_0122[1:2])
+#'
+#' #Obtaining the results as a single dataframe (including overlapping records)
+#' all_data <- merge_results(biblioverlap_results$db_list)
+#'
+#' #Checking the total number of rows and how many overlapping documents are in the dataframe
+#' nrow(all_data)
+#' sum(duplicated(all_data$UUID))
+#'
+#' #Obtaining only unique records as a single dataframe
+#' distinct_data <- merge_results(biblioverlap_results$db_list, filter_distinct = TRUE)
+#'
+#' #Checking the total number of rows and confirming that no UUID appears more than once
+#' nrow(distinct_data)
+#' sum(duplicated(distinct_data$UUID))
+#'
+merge_results <- function(db_list, filter_distinct = FALSE){
+  db_list <- lapply(db_list, function(df) dplyr::mutate_all(df, as.character)) #Coercing every column to character so that columns with the same name can always be combined
+  df <- dplyr::bind_rows(db_list, .id =  'SET_NAME') #Joining all info in a single table, while also adding a new column (SET_NAME) with the name of the set that record comes from
+  if (!'UUID' %in% names(df)) stop("Column 'UUID' not found. 'db_list' must be the list of matched dataframes returned by biblioverlap()") #Failing early instead of with a cryptic subsetting error
+  columns_to_front <- c("SET_NAME", "UUID") # Specifying the names of the columns to be moved to the front
+  df <- df[c(columns_to_front, setdiff(names(df), columns_to_front))] # Rearrange columns
+  if (filter_distinct) {
+    df <- dplyr::distinct(df, .data$UUID, .keep_all = TRUE) #Keeping a single row per UUID (distinct() keeps the first occurrence)
+  }
+  return(df)
+}
diff --git a/R/06-plots_and_app.R b/R/06-plots_and_app.R
@@ -177,7 +177,7 @@ plot_upset <- function(db_list, ...) {
 #' Shiny App for the biblioverlap package
 #'
 #' @param port - port of the application
-#' @param max_upload_size - max upload size of documents (in MB) - Default 100
+#' @param max_upload_size - max upload size of documents (in MB) - Default 1000
 #' @param launch.browser - launch on browser - Default = TRUE
 #'
 #' @return opens a instance of the biblioverlap UI

diff --git a/data-raw/data_prep.R b/data-raw/data_prep.R
@@ -45,7 +45,48 @@ ufrj_bio_0122 <- lapply(df_list, function(df) {
     dplyr::mutate(dplyr::across(dplyr::where(is.character), ~ iconv(., to = "UTF-8"))) #Making sure that all character fields are UTF-8 encoded
 })
 
-
 #Saving as package data
 usethis::use_data(ufrj_bio_0122, compress = 'xz')
 
+
+
+#Generating a venn diagram that shows the intersects calculated in each round of pairwise comparisons (at first, this image will only be displayed in the paper)

+#Source: https://stackoverflow.com/questions/72651478/how-do-i-make-certain-regions-of-of-my-venn-diagram-colored-and-have-the-rest-bl

+library(ggVennDiagram)
+library(ggplot2)

+test_data <- list(A = 1:1, B = 1:1, C = 1:1, D = 1:1) #Placeholder sets (each holds the single value 1); the region labels are overwritten below

+p <- ggVennDiagram(test_data, label_size = 15,
+                   set_size = 15, label_alpha = 1) +
+  scale_color_manual(values = rep("black", 4))

+#Names to be plotted instead of intersection counts
+new_names <- c('A', 'B', 'C', 'D',
+               'AB', 'AC', 'AD', 'BC', 'BD', 'CD',
+               'ABC', 'ABD', 'ACD', 'BCD',
+               'ABCD')

+p$layers[[1]]$mapping <- aes(fill = name) #Each name (intersection) can have an associated color now
+p$layers[[4]]$data$both <- new_names #Changing 'both' data column to have the intersect names, which will then be plotted instead
+
+
+p + scale_fill_manual(values = c(A = 'red',
+                                 A..B = 'red',
+                                 A..C = 'red',
+                                 A..D = 'red',
+                                 A..B..C = 'red',
+                                 A..B..D = 'red',
+                                 A..C..D = 'red',
+                                 A..B..C..D = 'red',
+                                 B = 'blue',
+                                 B..C = 'blue',
+                                 B..D = 'blue',
+                                 B..C..D = 'blue',
+                                 C = 'darkgreen',
+                                 C..D = 'darkgreen',
+                                 D = 'darkgreen'
+) ) + #Specifying a color for each intersect
+  theme(legend.position = 'none') #removing legend ('none' is the documented value for hiding it)
diff --git a/inst/biblioverApp/server.R b/inst/biblioverApp/server.R
@@ -41,16 +41,6 @@ server <- function(input, output, session) {
   }
 
 
-  # Merge files into a named list of dataframes
-  get_merged_db_list <- function(db_list) {
-    db_list <- lapply(db_list, function(df) dplyr::mutate_all(df, as.character))
-    df <- dplyr::bind_rows(db_list, .id =  'SET_NAME') #Joining all info in a single table, while also adding a new column (SET_NAME) with the name of the set that record comes from
-    columns_to_front <- c("SET_NAME", "UUID") # Specifying the names of the columns to be moved to the front
-    df <- df[c(columns_to_front, setdiff(names(df), columns_to_front))] # Rearrange columns
-
-    return( df )
-  }
-
  generate_dataset_input_fields <- function(id) {
    tagList(
       div(
@@ -160,13 +150,17 @@ server <- function(input, output, session) {
       )
   }
 
+  results_data_table <- reactive ({
+    return ( biblioverlap::merge_results(calculate_results()$db_list, filter_distinct = input$filter_distinct) )
+  }  )
+
 
   output$download_data <- downloadHandler(
     filename = function() {
       'result_data.csv'
     },
     content = function(file) {
-      write.csv(get_merged_db_list(calculate_results()$db_list), file, row.names = FALSE)
+      write.csv(results_data_table(), file, row.names = FALSE)
     }
   )
 
@@ -176,16 +170,13 @@ server <- function(input, output, session) {
       'summary.csv'
     },
     content = function(file) {
-      write.csv(calculate_results()$summary$df, file, row.names = FALSE)
+      write.csv(calculate_results()$summary, file, row.names = FALSE)
     }
   )
 
 
   output$full_table <- DT::renderDataTable({
-    table_list <- calculate_results()$db_list
-    table <- get_merged_db_list(table_list)
-
-    return( read_datatable(table) )
+    return( read_datatable(results_data_table()) )
   }, server = TRUE) ##Server is necessary because the db_list can be huge
 
   output$summary_table <- renderTable({
@@ -282,44 +273,32 @@ server <- function(input, output, session) {
   output$download_upset <- download_plot('upset_plot.png', upset_plot)
 
 
-  # Function to merge multiple input_files
-  merge_input_files <- reactive({
+  #Code for the 'Merge Files' tabset
+  merged_input_files <- reactive({
+
     input_files <- input$unmerged_files$datapath
     sep <- input$unmerged_sep
     quote <- input$unmerged_quote
 
-    df_list <- lapply(input_files, function(input_file) {
-      read.csv(input_file,
-               sep = sep,
-               quote = quote,
-               strip.white = TRUE,
-               check.names = FALSE) })
-    tryCatch({
-      df <- do.call(rbind, df_list)
-    }, error = function(err)
-      showNotification('Failed to merge files. Are they from the same database and/or have the same columns?', type = 'err', duration = NULL)
-    )
-    df[] <- lapply(df, function(col) { #Cleaning data (one column at a time)
-      col <- trimws(as.character(col)) # Removing leading and trailing whitespaces
-      col[which(col == "" | is.null(col))] <- NA  # Convert empty or null values to NA
-      return(col)
-    })
-    df <- df[!duplicated(df), ]   # Removing duplicate records
-
-    return( df )
+    merged_files <- biblioverlap::merge_input_files(input_files,
+                                    sep = sep,
+                                    quote = quote)
+    return(merged_files)
   })
 
+
+
   output$download_merged_file <- downloadHandler(
     filename = function() {
-      'merged_data.csv'
+      'merged_files.csv'
     },
     content = function(file) {
-      write.csv(merge_input_files(), file, row.names = FALSE)
+      write.csv(merged_input_files(), file, row.names = FALSE)
     }
   )
 
   output$merged_files_table <- DT::renderDataTable({
-      table <- merge_input_files()
+      table <- merged_input_files()
       return( read_datatable(table) )
     }, server = TRUE) ##Server is necessary because the db_list can be huge
 

diff --git a/inst/biblioverApp/ui.R b/inst/biblioverApp/ui.R
@@ -114,7 +114,9 @@ n_sets_ui <-
 
 
 results_data <-  tabPanel("Data",
-                          downloadButton("download_data", "Download Data", class = 'custom_button'),
+                          fluidRow(
+                          column(2, checkboxInput('filter_distinct', 'Filter distinct records')),
+                          column(2, downloadButton("download_data", "Download Data", class = 'custom_button')) ),
                           DT::dataTableOutput('full_table')
 )
 
@@ -208,7 +210,7 @@ results_venn <- tabPanel("Venn Diagram",
 )
 
 
-#Matybe split the results_upset into a part that contains the modify options and one for the plot itself
+#Maybe split the results_upset into a part that contains the modify options and one for the plot itself
 results_upset <-  tabPanel("UpSet Plot",
                            #tags$br(),
                            actionButton("modify_upset", "Modify plot", class = 'custom_button'),

diff --git a/man/biblioverApp.Rd b/man/biblioverApp.Rd
diff --git a/man/merge_input_files.Rd b/man/merge_input_files.Rd