-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathfun_nucleus.R
executable file
·148 lines (103 loc) · 4.95 KB
/
fun_nucleus.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#function upload a tab-delimited file and turn into a large list.
#Provides as argument the path containing the file and the type of structure
#contained in the title of the file ("nuclei" or "CCs")
import.large.list <- function(path, pattern){
#Create a large_list containing the Summary files for all accessions
##Get the name of the files in the directory of interest (files retrieved using bash script)
names.files <- list.files(path=path, pattern=pattern, full.names=TRUE)
##Create the large_list using read.delim function to import the files
list1 <- lapply(names.files, read.delim)
##Get the names of each files for the dataframe labelling
names(list1) <- list.files(path=path, pattern=pattern, full.names=FALSE)
#Return the large list generated
return(list1)
}
# Function returning a dataframe with a new column entitled "Accession", If argument "name" is provided, the date will be removed
# from accession name. If not, the date is included
get.df <- function(large.list, name_mode='default'){
#Tips from http://stackoverflow.com/questions/22689301/how-use-read-table-in-a-for-loop
#Turn the large list into a dataframe (combining by row)
df <- do.call(rbind.data.frame, large.list)
#Replace blanks by NA values in the dataframe
df[df==""] <- NA
#Get name of accession for each line from the "Label" column of the dataframe df
Label <- df$Label
#Convert the list into character to allow the parsing
Label <- as.character(Label)
#Combine the list by rows
Label <- rbind(Label)
#Parse the "Label" so that the number of the image + the extension are removed
# If argument name_mode="name", return only the accession name, otherwise, return name and date
if(name_mode == "name"){
Label <- (strsplit(Label, "_"))
} else {
Label <- (strsplit(Label, "(-|_)[0-9]{1,}.tif$"))
}
#Get all first element of the nested lists created (which should contain the accession name)
accession <- lapply(Label, `[[`, 1)
#add a new column in the dataframe df called "Accession" which contains all
#parsed names defined in acc
df$Accession <- accession
#Put the column "Accession" at the first position (choose last column (Accession)),
#New pipeline with nuclei results generated 47 columns
df <- subset(df, select=c(Accession, 1:47))
#Unlist column "Accession"
df$Accession <- unlist(df$Accession)
#Turn the column "Accession" into a factor
df$Accession <- as.factor(df$Accession)
#Return the dataframe
return(df)
}
#Summarize nuclei variables for each accession
summarize.nuclei <- function(df){
#Mean the different variables and select only the columns for nuclear characteristics
df_nuclei <- subset(df, select=c(1:26))
df_CC <- subset(df, select=c(1,2, 28:48))
df_mean_nuclei <- aggregate(df_nuclei[3:26], list(df$Accession), mean)
names(df_mean_nuclei)[1] <- "Accession"
df_mean_CC <- aggregate(df_CC[3:23], list(df$Accession), mean, na.rm = TRUE)
names(df_mean_CC)[1] <- "Accession"
#Keep accession tag, name, label, and type
df_type <- subset(df, select=c(49,1,27))
#Get HX values, 1 value per accession
df_hx <- hx.calculator(df_type)
#Merge dataframes
df_merged <- merge(df_mean_nuclei, df_mean_CC, by="Accession")
df_merged <- merge(df_merged, df_hx, by="Accession")
#Add tag
df_tag <- levels(df$tg_ecotypeid)
df_accession <- levels(df$Accession)
df_name <- cbind(df_tag, df_accession)
df_name <- as.data.frame(df_name)
names(df_name)[1] <- "tg_ecotypeid"
names(df_name)[2] <- "Accession"
df_merged <- merge(df_merged, df_name, by="Accession")
#Reorder columns
df_merged <- subset(df_merged, select=c(1, 48, 2:47))
return(df_merged)
}
hx.calculator <- function(df){
#Hx all accessions together
general_hx <- sum(df$Type=="compact")/sum(df$Type!="")
#Summarize the value of Type for each accession (creates a table)
table_type <- (table(df$Accession, df$Type))
#Convert in df
df_table_type <- as.data.frame.matrix(table_type)
#Convert row names into a column "Accession"
df_table_type <- tibble::rownames_to_column(df_table_type, "Accession")
#Add a 4th column with the ratio 'compact nuclei/all nuclei'
df_table_type[4] <- df_table_type[2]/(df_table_type[3]+df_table_type[2])
#Keep only the first and last column
df_hx <- df_table_type[c(1,4)]
#Rename the 2 columns of the dataframe
colnames(df_hx) <- c("Accession", "HX")
return(df_hx)
}
#Print a box plot of the chosen dataframe and variable. Title should be added.
ggplot.boxplot <- function(df, group, variable, title){
ggplot(data = df, aes_string(x=group, y=variable)) +
ggtitle(title) + #Title
theme(plot.title = element_text(hjust = 0.5)) + #Center the title
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_boxplot() #Type of plot (box plot)
}