Skip to content

Commit

Permalink
Added functions to merge data in the main package
Browse files Browse the repository at this point in the history
'merge_results()' merges biblioverlap's results into a single dataframe and 'merge_input_files()' merges multiple files from the same source into a single file
  • Loading branch information
gavieira committed Dec 13, 2023
1 parent 1d9830a commit 670edba
Show file tree
Hide file tree
Showing 11 changed files with 250 additions and 45 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
export("%>%")
export(biblioverApp)
export(biblioverlap)
export(merge_input_files)
export(merge_results)
export(plot_matching_summary)
export(plot_upset)
export(plot_venn)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* Added 'quote' tabpanels alongside all `fileInput()` calls to determine quotation of input datasets
* Added download buttons for plots in ShinyApp
* Added package logo to plots
* Added `merge_results`: a function to merge biblioverlap's results into a single dataframe
* Added `merge_input_files`: a function to merge multiple files from the same source into a single file


# biblioverlap 1.0.3
Expand Down
51 changes: 51 additions & 0 deletions R/01-data_preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,54 @@ data_preprocessing <- function(db_list, matching_fields) {
} )
return(preprocessed_data)
}


#' Merge multiple input files from the same source
#'
#' @details
#' It is fairly common to retrieve data from a single bibliographic database in
#' small chunks. Thus, this function is designed to merge multiple files from
#' the same source into a single file while also removing duplicate records.
#'
#' Each file is read with [utils::read.csv()] and the resulting dataframes are
#' stacked with `rbind()`. Every column is then converted to character, leading
#' and trailing whitespace is removed, empty strings are replaced by `NA`, and
#' fully duplicated rows are dropped.
#'
#' @param input_files - a character vector containing the path to all input files
#' @param sep - field separator. Default: comma (',')
#' @param quote - quote type used for character fields. Default: Double quotes ('"')
#'
#' @return a single dataframe (all columns of type character) with all unique
#'   records from the input files
#' @export
#'
#' @examples
#'
#' ## Generating tempfiles
#' tempfile1 <- tempfile(fileext = ".csv")
#' tempfile2 <- tempfile(fileext = ".csv")
#' write.csv(ufrj_bio_0122$Biochemistry, file = tempfile1, row.names = FALSE)
#' write.csv(ufrj_bio_0122$Genetics, file = tempfile2, row.names = FALSE)
#'
#' ## Testing function
#' merged_files <- merge_input_files(c(tempfile1, tempfile2))
#' dim(merged_files)
#' head(merged_files)
#'
merge_input_files <- function(input_files,
                              sep = ",",
                              quote = '"') {
  # Fail early with a clear message instead of an obscure subsetting error
  if (length(input_files) == 0) {
    stop("No input files were provided.", call. = FALSE)
  }

  df_list <- lapply(input_files, function(input_file) {
    utils::read.csv(input_file,
                    sep = sep,
                    quote = quote,
                    strip.white = TRUE,
                    check.names = FALSE)
  })

  # rbind() fails when the files do not share the same set of columns
  df <- tryCatch(
    do.call(rbind, df_list),
    error = function(err) {
      stop('Failed to merge files. Are they from the same database and/or have the same columns?',
           call. = FALSE)
    }
  )

  # Cleaning data (one column at a time)
  df[] <- lapply(df, function(col) {
    col <- trimws(as.character(col)) # Removing leading and trailing whitespaces
    col[!is.na(col) & col == ""] <- NA # Converting empty values to NA
    col
  })

  # Removing duplicate records; drop = FALSE keeps a dataframe even when the
  # input files have a single column
  df[!duplicated(df), , drop = FALSE]
}
41 changes: 41 additions & 0 deletions R/05-biblioverlap.R
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,44 @@ biblioverlap <- function(db_list, matching_fields = default_matching_fields, n_t
#final_db_list$score_matrices <- score_matrices #For debugging
return (final_db_list) #Returning db_list and summary
}


#' Merge biblioverlap's results into a single dataframe
#'
#' @details
#' All columns of every dataframe are converted to character before the
#' dataframes are stacked, so that columns with the same name but different
#' types can be combined. The 'SET_NAME' and 'UUID' columns are moved to the
#' front of the resulting dataframe.
#'
#' @param db_list - named list of matched dataframes (with UUID column added by [`biblioverlap`])
#' @param filter_distinct - boolean value determining whether to return only the subset containing distinct records (TRUE) or to keep overlapping records between datasets (FALSE). When TRUE, only the first record found for each UUID is kept. Default: FALSE
#'
#' @return a single dataframe containing data from db_list, featuring an additional 'SET_NAME' column to indicate from which dataset each record came
#' @importFrom rlang .data
#' @export
#'
#' @examples
#'
#' #Running document-level matching procedure for two datasets
#' biblioverlap_results <- biblioverlap(ufrj_bio_0122[1:2])
#'
#' #Obtaining the results as a single dataframe (including overlapping records)
#' all_data <- merge_results(biblioverlap_results$db_list)
#'
#' #Checking the total number of rows and how many overlapping documents are in the dataframe
#' nrow(all_data)
#' sum(duplicated(all_data$UUID))
#'
#' #Obtaining only unique records as a single dataframe
#' distinct_data <- merge_results(biblioverlap_results$db_list, filter_distinct = TRUE)
#'
#' #Checking the total number of rows and confirming that no overlapping documents remain
#' nrow(distinct_data)
#' sum(duplicated(distinct_data$UUID))
#'
merge_results <- function(db_list, filter_distinct = FALSE) {
  # Converting every column to character so that bind_rows() does not fail on
  # columns that share a name but differ in type across datasets
  db_list <- lapply(db_list, function(df) {
    dplyr::mutate(df, dplyr::across(dplyr::everything(), as.character))
  })

  # Joining all info in a single table, while also adding a new column
  # (SET_NAME) with the name of the set that record comes from
  df <- dplyr::bind_rows(db_list, .id = 'SET_NAME')

  # Specifying the names of the columns to be moved to the front
  columns_to_front <- c("SET_NAME", "UUID")
  missing_columns <- setdiff(columns_to_front, names(df))
  if (length(missing_columns) > 0) {
    stop("Column(s) not found in db_list: ",
         paste(missing_columns, collapse = ", "),
         ". Was db_list generated by biblioverlap()?",
         call. = FALSE)
  }
  df <- df[c(columns_to_front, setdiff(names(df), columns_to_front))] # Rearrange columns

  if (filter_distinct) {
    # Keeping only the first record of each UUID (all other columns preserved)
    df <- dplyr::distinct(df, .data$UUID, .keep_all = TRUE)
  }

  df
}
2 changes: 1 addition & 1 deletion R/06-plots_and_app.R
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ plot_upset <- function(db_list, ...) {
#' Shiny App for the biblioverlap package
#'
#' @param port - port of the application
#' @param max_upload_size - max upload size of documents (in MB) - Default 100
#' @param max_upload_size - max upload size of documents (in MB) - Default 1000
#' @param launch.browser - launch on browser - Default = TRUE
#'
#' @return opens a instance of the biblioverlap UI
Expand Down
43 changes: 42 additions & 1 deletion data-raw/data_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,48 @@ ufrj_bio_0122 <- lapply(df_list, function(df) {
dplyr::mutate(dplyr::across(dplyr::where(is.character), ~ iconv(., to = "UTF-8"))) #Making sure that all character fields are UTF-8 encoded
})


# Saving as package data
usethis::use_data(ufrj_bio_0122, compress = 'xz')



# Generating a Venn diagram that shows the intersects calculated in each round
# of pairwise comparisons (at first, this image will only be displayed in the paper)

# Source: https://stackoverflow.com/questions/72651478/how-do-i-make-certain-regions-of-of-my-venn-diagram-colored-and-have-the-rest-bl

library(ggVennDiagram)
library(ggplot2)

# Dummy sets: the actual counts are irrelevant here because the region labels
# are overwritten with the intersect names further below
test_data <- list(A = 1L, B = 1L, C = 1L, D = 1L)

p <- ggVennDiagram(test_data, label_size = 15,
                   set_size = 15, label_alpha = 1) +
  scale_color_manual(values = rep("black", 4))

# Names to be plotted instead of intersection counts
new_names <- c('A', 'B', 'C', 'D',
               'AB', 'AC', 'AD', 'BC', 'BD', 'CD',
               'ABC', 'ABD', 'ACD', 'BCD',
               'ABCD')

# NOTE(review): the layer indices below rely on the layer order produced by the
# installed ggVennDiagram version - confirm after upgrading the package
p$layers[[1]]$mapping <- aes(fill = name) # Each name (intersection) can have an associated color now
p$layers[[4]]$data$both <- new_names # Changing 'both' data column to have the intersect names, which will then be plotted instead


p + scale_fill_manual(values = c(A = 'red',
                                 A..B = 'red',
                                 A..C = 'red',
                                 A..D = 'red',
                                 A..B..C = 'red',
                                 A..B..D = 'red',
                                 A..C..D = 'red',
                                 A..B..C..D = 'red',
                                 B = 'blue',
                                 B..C = 'blue',
                                 B..D = 'blue',
                                 B..C..D = 'blue',
                                 C = 'darkgreen',
                                 C..D = 'darkgreen',
                                 D = 'darkgreen'
                                 ) ) + # Specifying a color for each intersect
  theme(legend.position = 'none') # Removing legend ('none' is the documented value for hiding it)
59 changes: 19 additions & 40 deletions inst/biblioverApp/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,6 @@ server <- function(input, output, session) {
}


# Merge files into a named list of dataframes
get_merged_db_list <- function(db_list) {
db_list <- lapply(db_list, function(df) dplyr::mutate_all(df, as.character))
df <- dplyr::bind_rows(db_list, .id = 'SET_NAME') #Joining all info in a single table, while also adding a new column (SET_NAME) with the name of the set that record comes from
columns_to_front <- c("SET_NAME", "UUID") # Specifying the names of the columns to be moved to the front
df <- df[c(columns_to_front, setdiff(names(df), columns_to_front))] # Rearrange columns

return( df )
}

generate_dataset_input_fields <- function(id) {
tagList(
div(
Expand Down Expand Up @@ -160,13 +150,17 @@ server <- function(input, output, session) {
)
}

results_data_table <- reactive ({
return ( biblioverlap::merge_results(calculate_results()$db_list, filter_distinct = input$filter_distinct) )
} )


output$download_data <- downloadHandler(
filename = function() {
'result_data.csv'
},
content = function(file) {
write.csv(get_merged_db_list(calculate_results()$db_list), file, row.names = FALSE)
write.csv(results_data_table(), file, row.names = FALSE)
}
)

Expand All @@ -176,16 +170,13 @@ server <- function(input, output, session) {
'summary.csv'
},
content = function(file) {
write.csv(calculate_results()$summary$df, file, row.names = FALSE)
write.csv(calculate_results()$summary, file, row.names = FALSE)
}
)


output$full_table <- DT::renderDataTable({
table_list <- calculate_results()$db_list
table <- get_merged_db_list(table_list)

return( read_datatable(table) )
return( read_datatable(results_data_table()) )
}, server = TRUE) ##Server is necessary because the db_list can be huge

output$summary_table <- renderTable({
Expand Down Expand Up @@ -282,44 +273,32 @@ server <- function(input, output, session) {
output$download_upset <- download_plot('upset_plot.png', upset_plot)


# Function to merge multiple input_files
merge_input_files <- reactive({
#Code for the 'Merge Files' tabset
merged_input_files <- reactive({

input_files <- input$unmerged_files$datapath
sep <- input$unmerged_sep
quote <- input$unmerged_quote

df_list <- lapply(input_files, function(input_file) {
read.csv(input_file,
sep = sep,
quote = quote,
strip.white = TRUE,
check.names = FALSE) })
tryCatch({
df <- do.call(rbind, df_list)
}, error = function(err)
showNotification('Failed to merge files. Are they from the same database and/or have the same columns?', type = 'err', duration = NULL)
)
df[] <- lapply(df, function(col) { #Cleaning data (one column at a time)
col <- trimws(as.character(col)) # Removing leading and trailing whitespaces
col[which(col == "" | is.null(col))] <- NA # Convert empty or null values to NA
return(col)
})
df <- df[!duplicated(df), ] # Removing duplicate records

return( df )
merged_files <- biblioverlap::merge_input_files(input_files,
sep = sep,
quote = quote)
return(merged_files)
})



output$download_merged_file <- downloadHandler(
filename = function() {
'merged_data.csv'
'merged_files.csv'
},
content = function(file) {
write.csv(merge_input_files(), file, row.names = FALSE)
write.csv(merged_input_files(), file, row.names = FALSE)
}
)

output$merged_files_table <- DT::renderDataTable({
table <- merge_input_files()
table <- merged_input_files()
return( read_datatable(table) )
}, server = TRUE) ##Server is necessary because the db_list can be huge

Expand Down
6 changes: 4 additions & 2 deletions inst/biblioverApp/ui.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ n_sets_ui <-


results_data <- tabPanel("Data",
downloadButton("download_data", "Download Data", class = 'custom_button'),
fluidRow(
column(2, checkboxInput('filter_distinct', 'Filter distinct records')),
column(2, downloadButton("download_data", "Download Data", class = 'custom_button')) ),
DT::dataTableOutput('full_table')
)

Expand Down Expand Up @@ -208,7 +210,7 @@ results_venn <- tabPanel("Venn Diagram",
)


#Matybe split the results_upset into a part that contains the modify options and one for the plot itself
#Maybe split the results_upset into a part that contains the modify options and one for the plot itself
results_upset <- tabPanel("UpSet Plot",
#tags$br(),
actionButton("modify_upset", "Modify plot", class = 'custom_button'),
Expand Down
2 changes: 1 addition & 1 deletion man/biblioverApp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 44 additions & 0 deletions man/merge_input_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 670edba

Please sign in to comment.