PS117_proteomicsbioinformatics.Rmd

Load necessary packages
```{r}
library(dplyr)
library(ggplot2)
library(magrittr)
library(reshape2)
library(tidyr)
library(plotly)
library(tidyverse)
library(cowplot)
library(stringr)
library(cowplot)
library(ggpubr)
library(data.table)
library(gplots)
library(treemapify)

# Work from the OpenMS output folder: the later chunks list, read and write
# files by relative name, so they all resolve against this directory.
setwd("D:/School/PhD/PS117/data/open-MS_output/")
```

Sample name map: converts the sample numbers used on the mass spectrometer into treatment and replicate labels.
```{r}
# Lookup from mass-spec sample number ("S01" ... "S30") to the sample label.
# The labels are listed in sample-number order; the names are generated to
# match, so position k in the vector below belongs to sample S<k>.
# NOTE(review): there is no "BA1_LT_noFe_T8_A" entry and S30 is a fourth BA1 T0
# replicate ("..._D") -- presumably intentional; confirm against the run sheet.
sample_namenumber_map <- setNames(
  c(
    "BA1_T0_T0_T0_A", "BA1_T0_T0_T0_B", "BA1_T0_T0_T0_C",
    "BA1_LT_noFe_T8_B", "BA1_LT_noFe_T8_C",
    "BA1_LT_Fe_T8_A", "BA1_LT_Fe_T8_B", "BA1_LT_Fe_T8_C",
    "BA1_HT_noFe_T8_A", "BA1_HT_noFe_T8_B", "BA1_HT_noFe_T8_C",
    "BA1_HT_Fe_T8_A", "BA1_HT_Fe_T8_B", "BA1_HT_Fe_T8_C",
    "BA2_T0_T0_T0_A", "BA2_T0_T0_T0_B", "BA2_T0_T0_T0_C",
    "BA2_LT_noFe_T8_A", "BA2_LT_noFe_T8_B", "BA2_LT_noFe_T8_C",
    "BA2_LT_Fe_T8_A", "BA2_LT_Fe_T8_B", "BA2_LT_Fe_T8_C",
    "BA2_HT_noFe_T8_A", "BA2_HT_noFe_T8_B", "BA2_HT_noFe_T8_C",
    "BA2_HT_Fe_T8_A", "BA2_HT_Fe_T8_B", "BA2_HT_Fe_T8_C",
    "BA1_T0_T0_T0_D"
  ),
  sprintf("S%02d", 1:30)
)
```

functional annotation with taxonomy
```{r}
#This script takes the final pipeline output (a .csv file) after database searching and assigns a taxonomic group and a functional annotation to each peptide using an annotation file. It does not assign a taxonomic/functional annotation to any peptide that matches more than one taxonomic group (I might tackle razor peptides, or peptides that match more than one taxonomy, at a later point). 

#It also aggregates taxonomies based on the lowest taxonomic resolution, e.g. if a peptide matches both centric and pennate diatoms, it will classify it as "unknown_Bacillariophyta"; if that taxonomic classification is ambiguous, it will move up until it reaches the domain - if domains are ambiguous it will classify it as 'ambiguous domain' -- all 'unknown' classifications are peptides that matched to an NA annotation. 

#It then normalizes the abundance of each peptide (MS1 ion intensity) to the total peptide abundance in that injection. Total peptide abundance does not include CRAP (I remove CRAP matches early on), but it includes ambiguous (peptides that match more than one annotation) and non-ambiguous peptides. 



# Annotation table used to attach taxonomy and function to each ORF. Only the
# listed column positions are kept; the pipeline below refers to them by name
# (orf_id, cluster, best_tax_string, kegg_hit, ...).
# NOTE(review): the positions are tied to the layout of this particular CSV --
# confirm them if the annotation file is regenerated.
annotations <- fread("D:/School/PhD/PS117/data/datannotations/annotation_allTFG.grpnorm_mmetsp_fc_pn_reclassified.edgeR.csv") [,c(1,2,4:9,12:15,22,26:29,44,43,45)]


# Search-result files to annotate. `pattern` is a regular expression; as
# written it names one injection file, so the lapply below handles that file.
filelist <- list.files(pattern = "211216_1037_097_LJ_S01_48.csv") 

# Annotate every search-result file in `filelist` and write one CSV per file.
#
# For each file the pipeline
#   1. keeps columns 1, 2, 3 and 5 and renames `protein` to `orf_id`,
#   2. drops CRAP matches and expands peptides matching several ORFs into one
#      row per ORF,
#   3. joins the ORF annotation table and splits `best_tax_string` into eight
#      taxonomic levels (tax_a ... tax_h),
#   4. collapses back to one row per peptide and counts how many distinct
#      values each annotation / taxonomic level has for that peptide,
#   5. keeps an annotation only when it is unique (otherwise "ambiguous_*"),
#      and walks up the taxonomy to the lowest level that is unique,
#   6. sums abundance per peptide + annotation string and writes
#      "PS117_non_normalized_tax-annotation_20220401_<input file name>".
#
# Reads the global `annotations`; returns nothing useful (called for its
# file-writing side effect), so the list of NULLs is not printed.
invisible(lapply(filelist, function(x) {
  # Replace empty strings with a placeholder label (NA values are left as NA).
  fill_blank <- function(values, label) {
    sub("^$", label, as.character(values))
  }
  # Number of distinct entries in each collapsed, `sep`-delimited string.
  count_unique <- function(values, sep) {
    lengths(lapply(strsplit(values, split = sep), unique))
  }
  # Keep the first entry when all entries agree, otherwise flag as ambiguous.
  consensus <- function(values, n_unique, ambiguous_label, sep) {
    ifelse(n_unique != 1, ambiguous_label, word(values, 1, sep = sep))
  }

  tax_cols <- paste0("tax_", letters[1:8]) # tax_a ... tax_h
  # Columns collapsed with toString() (", ") versus with "---". The "---"
  # separator is used where a single annotation may itself contain commas.
  comma_joined <- c("cluster", "orf_id", "kegg_hit", "KO", "KOG_id",
                    "KOG_group", "grpnorm_compartment", tax_cols,
                    "grpnorm_taxgrp")
  dash_joined <- c("KOG_class", "PFams", "TIGRFams", "best_hit_annotation")

  read.table(x, header = TRUE) %>%
    select(1, 2, 3, 5) %>% # read only columns of interest
    rename(orf_id = protein) %>% # same key name as the annotation file
    # Delete matches with the CRAP database.
    # NOTE(review): this drops every ORF id containing "sp" anywhere in the
    # string -- presumably CRAP ids are the only ones that do; confirm.
    filter(!grepl("sp", orf_id)) %>%
    separate_rows(orf_id, sep = "/") %>% # one row per matched ORF
    # ORFs matching both database and decoy carry an "XXX_" prefix; strip it.
    mutate(orf_id = gsub("XXX_", "", as.character(orf_id))) %>%
    # Inner join: peptides whose ORF is missing from the annotation file are
    # dropped here (all.x = TRUE is NOT set, unlike an earlier comment said).
    merge(annotations, by = "orf_id") %>%
    # Strings that stop above species level leave the remaining levels NA
    # (tidyr warns about this); they are filled in just below.
    separate(best_tax_string, sep = ";", into = tax_cols) %>%
    # Put a placeholder in all empty cells to simplify later data handling.
    mutate(
      cluster = fill_blank(cluster, "unassigned_cluster"),
      kegg_hit = fill_blank(kegg_hit, "unassigned_kegg_hit"),
      KO = fill_blank(KO, "unassigned_KO"),
      KOG_id = fill_blank(KOG_id, "unassigned_KOG_id"),
      KOG_class = fill_blank(KOG_class, "unassigned_KOG_class"),
      KOG_group = fill_blank(KOG_group, "unassigned_KOG_group"),
      PFams = fill_blank(PFams, "unassigned_PFams"),
      TIGRFams = fill_blank(TIGRFams, "unassigned_TIGRFams"),
      best_hit_annotation = fill_blank(best_hit_annotation,
                                       "unassigned_best_hit_annot"),
      grpnorm_compartment = fill_blank(grpnorm_compartment,
                                       "unassigned_grpnorm_comp"),
      grpnorm_taxgrp = fill_blank(grpnorm_taxgrp, "unassigned_grpnorm_taxgrp")
    ) %>%
    # Taxonomy cells can be NA or empty; both become "unassigned_tax".
    replace_na(
      setNames(as.list(rep("unassigned_tax", length(tax_cols))), tax_cols)
    ) %>%
    mutate(across(all_of(tax_cols), ~ fill_blank(.x, "unassigned_tax"))) %>%
    # Back to one row per peptide, with every matched value listed.
    group_by(peptide, n_proteins, abundance) %>%
    summarise(
      across(all_of(comma_joined), toString),
      across(all_of(dash_joined), ~ str_flatten(.x, "---"))
    ) %>%
    ungroup() %>%
    # How many distinct annotations / taxa does each peptide match? More than
    # one means the peptide is ambiguous for that field.
    mutate(
      uniq_cluster = count_unique(cluster, ", "),
      uniq_kegg_hit = count_unique(kegg_hit, ", "),
      uniq_KO = count_unique(KO, ", "),
      uniq_KOG_id = count_unique(KOG_id, ", "),
      uniq_KOG_class = count_unique(KOG_class, "---"),
      uniq_KOG_group = count_unique(KOG_group, ", "),
      uniq_PFams = count_unique(PFams, "---"),
      uniq_TIGRFams = count_unique(TIGRFams, "---"),
      uniq_best_hit_annotation = count_unique(best_hit_annotation, "---"),
      uniq_grpnorm_compartment = count_unique(grpnorm_compartment, ", "),
      uniq_a = count_unique(tax_a, ", "),
      uniq_b = count_unique(tax_b, ", "),
      uniq_c = count_unique(tax_c, ", "),
      uniq_d = count_unique(tax_d, ", "),
      uniq_e = count_unique(tax_e, ", "),
      uniq_f = count_unique(tax_f, ", "),
      uniq_g = count_unique(tax_g, ", "),
      uniq_h = count_unique(tax_h, ", "),
      uniq_grpnorm_taxgrp = count_unique(grpnorm_taxgrp, ", ")
    ) %>%
    # Zero tolerance: a field is assigned only if every match agrees.
    mutate(
      final_cluster = consensus(cluster, uniq_cluster,
                                "ambiguous_cluster", ","),
      final_kegg_hit = consensus(kegg_hit, uniq_kegg_hit,
                                 "ambiguous_kegg_hit", ","),
      final_KO = consensus(KO, uniq_KO, "ambiguous_KO", ","),
      final_KOG_id = consensus(KOG_id, uniq_KOG_id, "ambiguous_KOG_id", ","),
      final_KOG_class = consensus(KOG_class, uniq_KOG_class,
                                  "ambiguous_KOG_class", "---"),
      final_KOG_group = consensus(KOG_group, uniq_KOG_group,
                                  "ambiguous_KOG_group", ","),
      final_PFams = consensus(PFams, uniq_PFams, "ambiguous_PFams", "---"),
      final_TIGRFams = consensus(TIGRFams, uniq_TIGRFams,
                                 "ambiguous_TIGRFams", "---"),
      final_best_hit_annotation = consensus(best_hit_annotation,
                                            uniq_best_hit_annotation,
                                            "ambiguous_best_hit_annotation",
                                            "---"),
      final_grpnorm_compartment = consensus(grpnorm_compartment,
                                            uniq_grpnorm_compartment,
                                            "ambiguous_grpnorm_compartment",
                                            ","),
      final_grpnorm_taxgrp = consensus(grpnorm_taxgrp, uniq_grpnorm_taxgrp,
                                       "ambiguous_grpnorm_taxgrp", ",")
    ) %>%
    # Walk up the taxonomy: when level X is ambiguous but its parent is unique,
    # report the parent. The c-ambiguous / b-unique step now returns tax_b (it
    # previously returned the ambiguous tax_c list). tax_h is not used here.
    mutate(final_taxon_id =
             ifelse(uniq_g != 1 & uniq_f == 1, tax_f,
             ifelse(uniq_f != 1 & uniq_e == 1, tax_e,
             ifelse(uniq_e != 1 & uniq_d == 1, tax_d,
             ifelse(uniq_d != 1 & uniq_c == 1, tax_c,
             ifelse(uniq_c != 1 & uniq_b == 1, tax_b,
             ifelse(uniq_b != 1 & uniq_a == 1, tax_a,
             ifelse(uniq_a != 1, "ambiguous_domain",
                    tax_g)))))))) %>%
    mutate(final_taxon_id = word(final_taxon_id, 1, sep = ",")) %>%
    # One " --- "-delimited string holding every final annotation; it is split
    # back into columns after the injections have been combined.
    mutate(taxon_function = paste(
      final_cluster, final_kegg_hit, final_KO, final_KOG_id, final_KOG_class,
      final_KOG_group, final_PFams, final_TIGRFams, final_best_hit_annotation,
      final_grpnorm_compartment, final_grpnorm_taxgrp, final_taxon_id,
      sep = " --- "
    )) %>%
    # mutate(normalized_abundance = abundance / sum(abundance) * 1000000) %>%
    group_by(peptide, taxon_function) %>%
    summarise(
      non_norm_abundance = sum(abundance),
      orf_id = toString(orf_id)
    ) %>%
    ungroup() %>%
    write.csv(paste0("PS117_non_normalized_tax-annotation_20220401_", x),
              row.names = FALSE)
}))

#mutate (normalized_abundance = abundance /sum(abundance) *1000000 ) %>% #o

#b - delete all the peptides with matches to the CRAP database. Some peptides match to both CRAP and the actual sample; those will need to go as well. The normalization is done using only the final list of peptides (i.e. not including CRAP and other stuff)  
#c - peptides can match more than one ORF. This separates each of the matches into its own row
#d - remove the 'XXX_' from the ORFs that match to both database and decoy (we will still use those)
#e - add taxonomic ID and functional annotation to each peptide. Some peptides will match several ORFs with different tax. IDs, so they'll have more than one tax. ID. 
#f - separates the taxonomy string into different taxonomic levels. The warning here is because some taxonomic strings don't go all the way down to species, so this code fills those unknowns with "NA"
#g - gets rid of unnecessary columns and selects the taxonomic resolution that I want
#h - rename the column with the taxonomic level to make it more flexible to use in case I want to try more than one taxonomic resolution
#i - change all NAs to 'unknown', in case we are at a resolution where there are unknowns.
#j - puts things back into a list format
#k - adds a new column 'length' and shows the number of unique taxonomic groups that match to a peptide. So 1 means all the matches for that peptide came from one taxonomic group. 
#l - gets the peptides that matched to only one taxonomic group (note the ^ and $)
#m - extracts the first word from the list (so this is the consensus taxon)
#n - some words end with a comma and some don't. This just gets rid of the comma
#o - normalizes each row by the total sum and multiplies by a million 
```

Change the abundance column name to include the file name - this way we know which sample each value came from when we combine all the files.
```{r}
# Tag the abundance column of every annotated table with its file name, then
# overwrite each file in place.
#
# `pattern` is a regular expression, not a glob: the previous "PS117*" matched
# any file whose name merely contains "PS11". Anchor it so that only CSV files
# whose name starts with "PS117" (the annotated tables) are rewritten.
filenames <- list.files(pattern = "^PS117.*\\.csv$")

all_files <- lapply(setNames(nm = filenames), function(fn) {
  dat <- read.csv(fn)
  ind <- colnames(dat) == "non_norm_abundance"
  # Files that were already renamed have no such column and pass through
  # unchanged, so re-running this chunk is harmless.
  if (any(ind)) {
    colnames(dat)[ind] <- paste0(
      tools::file_path_sans_ext(basename(fn)), "_non_norm_abundance"
    )
  }
  dat
})

# Write every table back under its original name; invisible() keeps the list
# of NULLs returned by write.csv out of the chunk output.
invisible(Map(write.csv, all_files, names(all_files), row.names = FALSE))
```

Combine all the different injections into one data frame.
This chunk also fixes the taxon string issue:
some taxonomic group names have spaces in them and some don't, which causes problems when recovering the full taxonomic string.
```{r}
# Combine the per-injection tables into one wide table (one column per
# sample/injection), split the annotation string back into columns, and attach
# the full taxonomy string down to the lowest confidently assigned level.
# Uses the global `sample_namenumber_map`; writes
# "all_ps117_taxon_function_non-normalized_20220413.csv".

# Only the annotated per-injection CSVs (file names start with "PS117").
alldata <- lapply(list.files(pattern = "^PS117.*\\.csv$"), read.csv)

# full_join already keeps peptides that are missing from some injections (their
# abundance becomes NA there). `all = TRUE` is a base::merge() argument, not a
# full_join() one, so it is no longer passed along.
combined <- reduce(alldata, full_join,
                   by = c("peptide", "taxon_function", "orf_id"))

# Reduce the column names to sample number and injection number only.
names(combined) <- gsub(pattern = "PS117_non_normalized_tax.annotation_20220401_211216_1037_097_LJ_*", replacement = "", x = names(combined))
names(combined) <- gsub(pattern = "_non_norm_abundance*", replacement = "", x = names(combined))

# reshape2:: is spelled out because data.table is attached after reshape2 and
# masks melt()/dcast(); the inputs here are plain data frames.
combined_v2 <- reshape2::melt(
  combined,
  id.vars = c("peptide", "taxon_function", "orf_id")
) %>%
  separate(variable, c("treatment", "injection_number")) %>%
  rename(non_normalized_abundance = value)

# Sample number -> treatment label, then append the injection number.
combined_v2$treatment <- sample_namenumber_map[combined_v2$treatment]
combined_v2$treatment <- paste(combined_v2$treatment,
                               combined_v2$injection_number, sep = "_")

# Keep peptide, orf_id, taxon_function, treatment and abundance (in that
# order), then spread to one column per treatment/injection.
combined_v2 <- combined_v2[c(1, 3, 2, 4, 6)]
combined_v3 <- reshape2::dcast(
  combined_v2,
  peptide + orf_id + taxon_function ~ treatment,
  value.var = "non_normalized_abundance"
)
combined_v3[is.na(combined_v3)] <- 0 # replace NA's with 0

combined_v3 <- combined_v3 %>%
  separate(taxon_function,
           into = c("cluster", "kegg_hit", "KO", "KOG_id", "KOG_class",
                    "KOG_group", "PFams", "TIGRFams", "best_hit_annotation",
                    "grpnorm_compartment", "grpnorm_taxgrp",
                    "lowest_res_taxon_id"),
           sep = "---")

# Add the entire taxonomy string down to the highest taxonomic resolution we
# can confidently ID.
# NOTE: this overwrites the global `annotations` with a 3-column version, so
# the annotation chunk has to re-read its own copy before it is run again.
annotations <- read.csv("D:/School/PhD/PS117/data/datannotations/annotation_allTFG.grpnorm_mmetsp_fc_pn_reclassified.edgeR.csv")[, c(1, 43, 45)]

stringlist <- unique(annotations$best_tax_string)

# Splitting on "---" leaves spaces around each piece, which made matching on
# the full taxon name unreliable. Use only its last word for the lookup.
combined_v3$lowest_res_taxon_id_word <- word(combined_v3$lowest_res_taxon_id, -1)

# For each taxon word, take the first taxonomy string that contains it and cut
# that string off right after the word. No match gives NA.
# NOTE(review): the word is used as a regular expression, so names containing
# regex metacharacters would not match literally.
combined_v3$taxon_string_new <- sapply(
  combined_v3$lowest_res_taxon_id_word,
  function(x) {
    sub(sprintf("(.*%s).*", x), "\\1", grep(x, stringlist, value = TRUE)[1])
  }
)

combined_v4 <- combined_v3 %>%
  separate(taxon_string_new,
           into = c("domain", "B", "C", "class", "class_x", "F", "G", "genus"),
           sep = ";")

# Levels below the assigned one are NA; fill them with the lowest-resolution
# taxon id so every taxonomy column carries a label.
combined_v5 <- combined_v4 %>%
  mutate(across(c(domain:genus),
                ~ ifelse(is.na(.), combined_v4$lowest_res_taxon_id, .)))

# Reorder: ids, taxonomy, functional annotation, then the abundance columns.
# Column 75 (lowest_res_taxon_id_word) is left out.
# NOTE(review): these positions assume 60 abundance columns -- confirm when the
# number of injections changes.
combined_v6 <- combined_v5[c(1:3, 13, 14, 76:83, 4:12, 15:74)]

# This version should have all the taxonomies corrected.
write.csv(combined_v6, "all_ps117_taxon_function_non-normalized_20220413.csv",
          row.names = FALSE)




```

Get injection average.
Because we injected each biological replicate twice, we need the average of those two injections for DE work and other stats.
```{r}
# Average the technical injections of every biological replicate.
# Reads the combined table, melts the abundance columns to long form, averages
# abundance over the injections sharing the same peptide, annotation and
# treatment, spreads back to one column per treatment and writes
# "all_ps117_taxon_function_non-normalized_injection_means_20220413.csv".
proteomicsdata <- read.csv(
  "all_ps117_taxon_function_non-normalized_20220413.csv",
  header = TRUE
)

# The first 22 columns are ids/annotations; the rest are abundance columns
# named bioassay_temperature_iron_timepoint_replicate_injection.
# reshape2:: is spelled out because data.table (attached later) masks
# melt()/dcast() and the input is a plain data frame.
melted_proteomicsdata <- reshape2::melt(proteomicsdata, id.vars = c(1:22)) %>%
  rename(treatment = variable, non_norm_abundance = value) %>%
  separate(treatment,
           into = c("bioassay", "temperature", "iron", "timepoint",
                    "replicate", "injection"),
           sep = "_")

# Treatment label without the injection number, so both injections of one
# replicate fall into the same group.
melted_proteomicsdata$treatment <- paste(
  melted_proteomicsdata$bioassay,
  melted_proteomicsdata$timepoint,
  melted_proteomicsdata$temperature,
  melted_proteomicsdata$iron,
  melted_proteomicsdata$replicate,
  sep = "_"
)

# Grouping columns, in the order they appear in the result. Passing them as a
# named list gives the result columns their final names directly, replacing
# the Category / Group.N rename chain.
group_cols <- c(
  "peptide", "orf_id", "cluster", "grpnorm_taxgrp", "lowest_res_taxon_id",
  "domain", "B", "C", "class", "class_x", "F", "G", "genus",
  "kegg_hit", "KOG_class", "KOG_group", "PFams", "TIGRFams",
  "best_hit_annotation", "grpnorm_compartment", "treatment"
)

# aggregate() omits rows that have NA in any grouping column.
melted_proteomicsdata_av <- aggregate(
  melted_proteomicsdata$non_norm_abundance,
  by = melted_proteomicsdata[group_cols],
  FUN = mean
) %>%
  rename(non_norm_abundance = x)

# value.var is given explicitly; dcast previously had to guess it.
# NOTE(review): if a peptide has more than one row per treatment (e.g. several
# annotation strings), dcast falls back to counting rows, not averaging --
# confirm peptides are unique per treatment at this point.
melted_proteomicsdata_av2 <- reshape2::dcast(
  melted_proteomicsdata_av,
  peptide ~ treatment,
  value.var = "non_norm_abundance"
)

# Re-attach the annotation columns to the averaged abundances.
tax_func <- proteomicsdata[c(1:22)]

final <- merge(tax_func, melted_proteomicsdata_av2, by = "peptide")

write.csv(
  final,
  "all_ps117_taxon_function_non-normalized_injection_means_20220413.csv",
  row.names = FALSE
)
```

add cluster annotations to clusters 
```{r}
# Attach a human-readable MCL cluster annotation to every peptide and strip
# the stray spaces from the cluster / taxonomy columns, then write
# "all_ps117_taxon_function_non-normalized_injection_means_20220421.csv".

# Four columns of the MCL annotation table, picked by position.
mcl_annotation <- read.csv("D:/School/PhD/Data/FragTranscriptome/Transcriptome/MCL_tfg_de_annotation_allTFG.grpnorm_mmetsp_fc_pn_reclassified.csv")[, c(48, 50, 51, 52)]

# One row per cluster; empty annotation fields get a placeholder so that the
# pasted label below is never blank.
mcl_annotation2 <- mcl_annotation %>%
  distinct(cluster, .keep_all = TRUE) %>%
  mutate(across(
    c(ann_type, ann_id, ann_desc),
    ~ gsub("^$", "noclusterannotation", as.character(.x))
  )) %>%
  mutate(cluster_annotation = paste(ann_type, ann_id, ann_desc, sep = "_"))

# Keep the first column and the new label (columns 1 and 5); remove spaces
# from the cluster id so it matches the proteomics table.
mcl_annotation3 <- mcl_annotation2[c(1, 5)]
mcl_annotation3$cluster <- gsub(" ", "", as.character(mcl_annotation3$cluster))

# Now combine the cluster labels with the master file.
# NOTE(review): this reads from ".../PS117/data/", whereas the injection-means
# file is written relative to the working directory -- confirm the path.
proteomicsdata <- read.csv(
  "D:/School/PhD/PS117/data/all_ps117_taxon_function_non-normalized_injection_means_20220413.csv",
  header = TRUE
)
proteomicsdata$cluster <- gsub(" ", "", as.character(proteomicsdata$cluster))

# all.y = TRUE keeps every peptide, including those whose cluster has no entry
# in the annotation table.
proteomicsdata2 <- merge(mcl_annotation3, proteomicsdata,
                         by = "cluster", all.y = TRUE)

# Sort by peptide to keep the same order as the original proteomics file.
proteomicsdata2 <- proteomicsdata2[order(proteomicsdata2$peptide), ]

# Peptides without a cluster label fall back to the cluster value itself.
proteomicsdata2$cluster_annotation <- coalesce(
  proteomicsdata2$cluster_annotation,
  proteomicsdata2$cluster
)

# Move columns 3 and 4 to the front, ahead of cluster and its label.
proteomicsdata3 <- proteomicsdata2[c(3, 4, 1, 2, 5:53)]

# Remove the white space that splitting the annotation string left in the
# taxonomy columns.
proteomicsdata3 <- proteomicsdata3 %>%
  mutate(across(
    all_of(c("grpnorm_taxgrp", "lowest_res_taxon_id", "domain", "B", "C",
             "class", "class_x", "F", "G", "genus")),
    ~ gsub(" ", "", as.character(.x))
  ))

write.csv(
  proteomicsdata3,
  "all_ps117_taxon_function_non-normalized_injection_means_20220421.csv",
  row.names = FALSE
)



```










scrap
```{r}
# Scratch chunk: pull peptide subsets of interest out of the normalized table
# and plot mean +/- SE abundance per treatment for one subset.
setwd("D:/School/PhD/PS117/data/open-MS_output/")
alldata <- read.csv("all_ps117_normalized_tax_annotation_20220127.csv",
                    header = TRUE)

# Subsets selected by cluster id or keyword in the annotation string. The
# trailing space inside a pattern stops e.g. "clust_55 " matching "clust_551".
isip <- filter(alldata, grepl("clust_1814 |clust_343 |clust_1154 ", taxon_function))
csp <- filter(alldata, grepl("cold ", taxon_function))
hsp <- filter(alldata, grepl("Hsp", taxon_function))
plastocyanin <- filter(alldata, grepl("clust_1820 ", taxon_function))
flavodoxin <- filter(alldata, grepl("clust_534 |clust_4660 ", taxon_function))
nitrogen <- filter(alldata, grepl("clust_411 ", taxon_function))
p700 <- filter(alldata, grepl("clust_55 |clust_211 |clust_42 ", taxon_function))

# The second assignment replaces the first: `lhc` ends up as clust_332 only.
lhc <- filter(alldata, grepl("clust_1194 |clust_712 |clust_1236 |clust_332 |clust_80 |clust_402 ", taxon_function))
lhc <- filter(alldata, grepl("clust_332 ", taxon_function))
lhcfrag <- filter(lhc, grepl("Pseudo", taxon_function))

ribosomes <- filter(alldata, grepl("riboso", taxon_function))

# Long format for plotting. Despite its name, `isip1` is currently built from
# the `nitrogen` subset. reshape2:: is spelled out because data.table
# (attached later) masks melt() and the input is a plain data frame.
isip1 <- reshape2::melt(nitrogen, id.vars = c("taxon_function", "peptide")) %>%
  rename(treatment = variable, norm_abundance = value) %>%
  separate(treatment,
           into = c("bioassay", "temperature", "iron", "timepoint",
                    "replicate", "injection"),
           sep = "_") %>%
  separate(taxon_function,
           into = c("cluster", "x", "y", "z", "zz", "zzz"),
           sep = ";")

isip1$replicate <- str_sub(isip1$replicate, -2, -2)
isip1$injection <- str_sub(isip1$injection, 2)
isip1$treatment <- paste(isip1$temperature, isip1$iron, sep = "_")

treatment_order <- c("T0_T0", "LT_noFe", "LT_Fe", "HT_noFe", "HT_Fe")

# `levels =` is written out in full (it used to rely on partial matching of
# `level`), so the x axis follows treatment_order.
ggplot(isip1, aes(x = factor(treatment, levels = treatment_order),
                  y = norm_abundance, color = cluster)) +
  #geom_jitter(size = 3, width = 0.05, height = 0.5, alpha = 0.1)+
  #geom_point(size = 4, alpha = 0.4)+
  stat_summary(fun = mean, geom = "point", size = 3, stroke = 1, alpha = 0.6) +
  stat_summary(fun.data = mean_se, geom = "errorbar", size = 0.7, width = 0.1) +
  # Alternative legends for the other subsets:
  #scale_color_manual(name="",
                     #breaks = c("clust_1814 ", "clust_343 ", "clust_1154 "),
                     #labels = c("ISIP-1", "ISIP-2A", "ISIP-3"),
                     #values = c('red', 'black', 'blue')) +
  #scale_color_manual(name="",
                     #breaks = c("clust_55 ", "clust_42 ", "clust_211 "),
                     #labels = c("PSI-P700", "PSII-psbA", "PSII-psbC"),
                     #values = c('red', 'black', 'grey')) +
  scale_color_manual(name = "",
                     breaks = c("clust_411 ", "clust_991 "),
                     labels = c("Nitrate Reductase", "Nit. transporter"),
                     values = c("black", "red")) +
  theme_bw() +
  #scale_y_log10()+
  xlab("") +
  ylab("normalized abundance") +
  # A separate theme() that only set the x-label angle to 90 was dropped: the
  # angle = 45 below replaced it anyway.
  theme(axis.text.x = element_text(face = "bold", size = 15, color = "black",
                                   vjust = 1, hjust = 1, angle = 45),
        axis.title.y = element_text(size = 20, face = "bold", color = "black"),
        axis.text.y = element_text(face = "bold", size = 15, color = "black"),
        strip.background = element_rect(fill = "white"),
        legend.text = element_text(size = 12),
        strip.text.y = element_text(size = 15, face = "bold")) +
  facet_grid(bioassay ~ .)

```

Total intensity for each sample
```{r}
# Total ion intensity per injection: sum the `abundance` column of every
# search-result file and plot the totals by sample number.
setwd("D:/School/PhD/PS117/data/open-MS_output/")
#s1a <- read.table("210526_0977_097_S01_03.csv", header = TRUE)[,c(1,2,3,5)]
#s1atop <- head(arrange(s1a, desc(abundance)), n = 1) #this subsets the dataframe and shows the top n most abundant peptides.

# All csv files in the working directory, so they can be imported at once.
# `pattern` is a regular expression, so the dot is escaped and the extension
# anchored to the end of the name.
# NOTE(review): this also picks up the CSVs written by the other chunks into
# this folder -- confirm only raw search results are present when this runs.
filelist <- list.files(pattern = "\\.csv$")

# Column sum of the peptide intensities of each injection.
# The original called sum(..., rm.NA = T): `rm.NA` is not an argument of sum(),
# so TRUE was added to every total (+1) and NAs were not removed. The totals
# were also stored in a character vector; they are numeric now.
intensity <- vapply(
  filelist,
  function(path) {
    injection <- read.table(path, header = TRUE)
    sum(as.numeric(injection$abundance), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)

totalintensity <- data.frame(filelist, intensity)

# Sample number and method number are cut out of the file name by position
# (e.g. "211216_1037_097_LJ_S01_48.csv" -> "S01" and "48.").
totalintensity$sample_id <- str_sub(totalintensity$filelist, start = 20, end = 22)
# 'method' marks the injections that had a slightly different mass spec method.
totalintensity$method <- str_sub(totalintensity$filelist, start = 24, end = 26)
# Two-digit method numbers pick up the "." of ".csv"; strip it.
totalintensity$method <- gsub("^\\.|\\.$", "", totalintensity$method)
totalintensity$method <- as.numeric(totalintensity$method)

ggplot(totalintensity, aes(x = sample_id, y = intensity)) +
  # The blue points are the methods that Elden played with.
  geom_point(size = 5, alpha = 0.3, aes(color = method > 113)) +
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_log10(limits = c(10000, 10000000000000)) +
  theme_bw() +
  ylab(expression("total ion intensity")) +
  xlab("") +
  theme(axis.text.x = element_text(face = "bold", size = 15, color = "black",
                                   angle = 90, vjust = 0.5, hjust = 1),
        axis.text.y = element_text(face = "bold", size = 15, color = "black"),
        axis.title.y = element_text(size = 20, face = "bold", color = "black"),
        legend.position = "none")
```


total intensity sketchpad
```{r}
#totalintensity not including crap data
# Per-file abundance sum computed after dropping every row whose `protein`
# value matches 'sp'. The pattern is unanchored, so it removes any row with
# "sp" anywhere in the protein string.
# NOTE(review): presumably this targets the contaminant database entries -
# confirm that no wanted protein IDs contain "sp".
# The argument has to be spelled `na.rm`: sum() adds any other named argument
# to the total, so `rm.NA = T` would inflate every sum by 1 and keep NAs.
intensity <- vapply(
  filelist,
  function(csv_file) {
    kept_rows <- dplyr::filter(read.table(csv_file, header = TRUE), !grepl('sp', protein))
    sum(as.numeric(kept_rows$abundance), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)

totalintensity_nocrap <- data.frame(filelist, intensity)
# NOTE(review): sample_id/method are cut from characters 17-19 / 21-22 here,
# whereas `totalintensity` uses 20-22 / 24-26. Both frames share the x axis of
# the plot below - confirm which offsets match the current file names.
totalintensity_nocrap$sample_id <- str_sub(totalintensity_nocrap$filelist, start = 17, end = 19)
totalintensity_nocrap$method <- str_sub(totalintensity_nocrap$filelist, start = 21, end = 22)

# Red points: filtered totals; black/blue points: unfiltered totals taken from
# `totalintensity`, coloured by whether its `method` exceeds 63.
ggplot(totalintensity_nocrap, aes(x = sample_id, y = as.numeric(intensity))) +
  geom_point(color = "red", size = 3, alpha = 0.4) +
  geom_point(size = 3, alpha = 0.3, data = totalintensity, aes(y = as.numeric(intensity), color = method > 63)) +
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_log10(limits = c(1000000, 1000000000000)) +
  theme_bw() +
  ylab(expression("total ion intensity")) +
  xlab("") +
  theme(axis.text.x = element_text(face = "bold", size = 15, color = "black", angle = 90, vjust = 0.5, hjust = 1),
        axis.text.y = element_text(face = "bold", size = 15, color = "black"),
        axis.title.y = element_text(size = 20, face = "bold", color = "black"),
        legend.position = "none")


#plot intensity of only the n most abundant peptides in each sample

# For every file: drop rows whose `protein` matches 'sp', keep columns 1 and 5
# (used below as `peptide` and `abundance`), take the 50 rows with the highest
# abundance and spread them into a single wide row
# (peptide_1..peptide_50, abundance_1..abundance_50).
mostabund <- map_df(filelist, function(csv_file) {
  all_rows <- read.table(csv_file, header = T)
  kept_rows <- dplyr::filter(all_rows, !grepl('sp',protein)) #this is so the nocrap DB is not selected
  #read.table(.x, header = T)[,c(1,5)]%>%
  top_rows <- slice_max(kept_rows[, c(1, 5)], abundance, n = 50, with_ties = FALSE) #can also use slice_min for lowest abundnace
  ranked_rows <- mutate(top_rows, col = c(1:50))
  pivot_wider(ranked_rows, names_from = col, values_from = c(peptide, abundance))
})

# Put the file name in front of the wide table, then cut the sample and method
# identifiers out of the file name.
mostabund <- data.frame(filelist, mostabund)
mostabund[["sample_id"]] <- str_sub(mostabund[["filelist"]], start = 17, end = 19)
mostabund[["method"]] <- str_sub(mostabund[["filelist"]], start = 21, end = 22)

#this adds the abundance of all the top n PSMs
is_abundance_col <- grepl('abundance', names(mostabund))
mostabund[["totalabund"]] <- rowSums(mostabund[is_abundance_col])

# Red points: summed abundance of the top rows; black/blue points: the filtered
# per-file totals from `totalintensity_nocrap`.
ggplot(mostabund, aes(x = sample_id, y = as.numeric(totalabund))) +
  geom_point(color = "red") +
  geom_point(
    data = totalintensity_nocrap,
    aes(y = as.numeric(intensity), color = method > 63),
    size = 5,
    alpha = 0.3
  ) +
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_log10(limits = c(10, 1000000000000)) +
  theme_bw() +
  labs(x = "", y = expression("total ion intensity")) +
  theme(
    axis.text.x = element_text(face = "bold", size = 15, color = "black", angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(face = "bold", size = 15, color = "black"),
    axis.title.y = element_text(size = 20, face = "bold", color = "black"),
    legend.position = "none"
  )

##########################
##########################

#https://www.datanovia.com/en/blog/venn-diagram-with-r-or-rstudio-a-million-ways/

# Venn comparison of the values in columns 2-51 of `mostabund` across the
# rows that belong to one sample.
#now I want to see how many of those top most abundant peptides are shared among the samples. 
#focusing on the two different setting injections for sample 14 and 18 

# Only rows whose sample_id contains 'S14' are kept here; sample 18 is not
# selected by this filter.
s1418 <- filter(mostabund, grepl('S14', sample_id))

#first, I'm making a list of all the different top peptides in all the samples. 

# Columns 1-51: `filelist` followed by the next 50 columns.
# NOTE(review): presumably these are peptide_1..peptide_50 - this relies on the
# column order of `mostabund`; confirm.
x <- s1418 [c(1:51)]
# Adds a 52nd column holding characters 17-22 of the file name.
x$sample_id <- str_sub(x$filelist, start=17, end = 22)


# Column 52 (the six-character id just added) becomes the row names.
rownames(x) <- x [,52]

# Drop `filelist` (column 1) and the id (column 52); 50 columns remain.
x <- x[c(2:51)]
  
# Prints the distinct values across all remaining cells; the result is not stored.
unique (unlist(x))

# Transpose so that each original row becomes one named element of the list.
xy.list <- as.list(as.data.frame(t(x)))

vtable <- venn(xy.list)
# NOTE(review): venn.diagram() does not come from any package attached in the
# visible library() calls (it looks like VennDiagram::venn.diagram) - confirm
# it is loaded elsewhere. The four category names and fill colours assume
# exactly four elements in xy.list. `x` is overwritten with the return value.
x<- venn.diagram(xy.list, filename = "x.png", 
                category.names = c("S14_LT_a","S14_LT_b","S14_HT_a","S14_HT_b" ),
                #category.names = c("S18_LT_a","S18_LT_b","S18_HT_a","S18_HT_b" ),
                fill = c("red", "orange", "blue", "green"), 
                cex = 2)

# Same call as the `vtable` assignment above, stored under a second name and printed.
v.table <- venn(xy.list)
print (v.table)
```

Takes all the text files from the mass spec FileInfo output and puts them into one dataframe where the information can be viewed and plotted.
```{r}
# Collect the mass-spec FileInfo text reports into one table with one row per
# report: file name, number of peaks, and the level 1 / level 2 spectra counts.
setwd("D:/School/PhD/PS117/data/mass-spec-info/")

#creates a list of files names to be used
#reads files into a list of vectors
# `pattern` is a regular expression: the dot is escaped and the match anchored
# so that only names ending in ".txt" are read.
ps117_ms_info <- lapply(list.files(pattern = "\\.txt$"), readLines)
if (length(ps117_ms_info) == 0) {
  stop("No .txt files found in ", getwd(), call. = FALSE)
}

#convert each element of the list into a data frame
# `id` is the position of the report in the file listing. It is repeated to the
# number of lines so that a report with zero lines gives a zero-row frame
# instead of a row-count mismatch error.
ps117_ms_info_df <- lapply(seq_along(ps117_ms_info), function(i) {
  report_lines <- ps117_ms_info[[i]]
  data.frame(
    id = rep(i, length(report_lines)),
    rawdata = report_lines,
    stringsAsFactors = FALSE
  )
})

#combines into a single dataframe
ps117_ms_info_df <- do.call(rbind, ps117_ms_info_df)

#split the rawdata at the first ':' into parameter and output, and trim spaces
ps117_ms_info_df[, c("parameter", "output")] <- str_trim(str_split_fixed(ps117_ms_info_df$rawdata, ":", 2))

#convert from 'long' to 'wide' format - the parameter become column headings
ps117_ms_info_df_2 <- ps117_ms_info_df[, c("id", "parameter", "output")]

# Keep only the report lines needed below.
ps117_ms_info_df_3 <- dplyr::filter(ps117_ms_info_df_2, grepl('File name|Number of spectra$|Number of peaks|level 1|level 2', parameter))


ps117_ms_info_df_4 <- reshape(ps117_ms_info_df_3, idvar = "id", timevar = "parameter", direction = "wide")


# Columns are picked by position and renamed in that order.
# NOTE(review): this assumes the wide table is ordered id, File name,
# Number of spectra, Number of peaks, level 1, level 2 - confirm against the
# line order of the FileInfo reports.
ps117_ms_info_df_5 <- ps117_ms_info_df_4[, c(2, 4:6)]

names(ps117_ms_info_df_5) <- c("sample", "number_of_peaks", "ms1_spectra", "ms2_spectra")

# Sample number = characters 33-34 of the reported file name, prefixed with "S".
ps117_ms_info_df_5$sample_id <- str_sub(ps117_ms_info_df_5$sample, start = 33, end = 34)
ps117_ms_info_df_5$sample_id <- paste0("S", ps117_ms_info_df_5$sample_id)

#ps117_ms_info_df_5$samplenumber <- 1:nrow(ps117_ms_info_df_5)

#ps117_ms_info_df_5$method <- str_sub(ps117_ms_info_df_5$sample, start=33, end = 34)

#ps117_ms_info_df_5$samplename <- sample_namenumber_map [ps117_ms_info_df_5$sample_id]

# Drop reports whose file name contains BSA, Blk or new.
ps117_ms_info_df_5 <- dplyr::filter(ps117_ms_info_df_5, !grepl('BSA|Blk|new', sample))

# Axis/legend styling shared by the three stacked panels below; only the
# x tick-label size and the y-title element differ between panels.
spectra_panel_theme <- function(x_text_size, y_title) {
  theme(
    axis.text.x = element_text(face = "bold", size = x_text_size, color = "black", angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(face = "bold", size = 15, color = "black"),
    axis.title.y = y_title,
    legend.position = "none"
  )
}
bold_y_title <- element_text(size = 20, face = "bold", color = "black")

# Number of level-1 spectra per sample.
ms1 <- ggplot(
  ps117_ms_info_df_5,
  aes(x = sample_id, y = as.numeric(ms1_spectra))
) +
  geom_point(size = 5, alpha = 0.3) + #,aes(color =method >80))+
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_continuous(limits = c(0, 10000)) +
  theme_bw() +
  labs(x = "", y = expression("# of MS1 Spectra")) +
  spectra_panel_theme(8, element_text(size = 20, color = "black"))

# Number of level-2 spectra per sample.
ms2 <- ggplot(
  ps117_ms_info_df_5,
  aes(x = sample_id, y = as.numeric(ms2_spectra))
) +
  geom_point(size = 5, alpha = 0.3) + #aes(color =method >80))+
  #scale_colour_manual(values = c("black", "blue")) +
  scale_y_continuous(limits = c(0, 40000)) +
  theme_bw() +
  labs(x = "", y = expression("# of MS2 Spectra")) +
  spectra_panel_theme(5, bold_y_title)


# Ratio of level-2 to level-1 spectra per sample.
ms1ms2ratio <- ggplot(
  ps117_ms_info_df_5,
  aes(x = sample_id, y = (as.numeric(ms2_spectra) / as.numeric(ms1_spectra)))
) +
  geom_point(size = 5, alpha = 0.3) + #aes(color =method >80))+
  #scale_colour_manual(values = c("black", "blue")) +
  scale_y_continuous(limits = c(0, 6)) +
  theme_bw() +
  labs(x = "", y = expression("MS2 : MS1")) +
  spectra_panel_theme(12, bold_y_title)

# Stack the panels; x tick labels are shown on the bottom panel only.
plot_grid(
  ms1 + rremove("x.text"),
  ms2 + rremove("x.text"),
  ms1ms2ratio,
  nrow = 3,
  align = "v"
)
```

Outputs the number of peptide spectral matches (PSMs) identified for each injection after all the database searching is done.
```{r}
# Count the data rows (PSMs) of every OpenMS output table and plot the count
# per sample.
setwd("D:/School/PhD/PS117/data/open-MS_output/")

# `pattern` is a regular expression: the dot is escaped and the match anchored
# so that only names ending in ".csv" are listed.
filelist <- list.files(pattern = "\\.csv$")

# Lines per file excluding the header line (skip = 1), as a numeric vector in
# `filelist` order.
peps_total <- vapply(
  filelist,
  function(csv_file) length(count.fields(csv_file, skip = 1)),
  numeric(1),
  USE.NAMES = FALSE
)
samplenames <- filelist
ps117_npeptides <- data.frame(samplenames, peps_total)

ps117_npeptides$sample_id <- str_sub(ps117_npeptides$samplenames, start = 20, end = 22)
ps117_npeptides$samplenumber <- seq_len(nrow(ps117_npeptides))

# NOTE(review): characters 21-22 overlap the sample_id cut from 20-22, and the
# value stays character, so `method > 63` below is a string comparison. The
# `injection` column built later from 24-26 may be what is intended - confirm.
ps117_npeptides$method <- str_sub(ps117_npeptides$samplenames, start = 21, end = 22)

# Drop the files whose names contain one of these injection numbers.
ps117_npeptides <- dplyr::filter(ps117_npeptides, !grepl('_114|_115|_117|_118|_120|_121|_123|_124', samplenames))


psmtotal <- ggplot(ps117_npeptides, aes(x = sample_id, y = as.numeric(peps_total))) +
  geom_point(size = 5, alpha = 0.3, aes(color = method > 63)) +
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_continuous(limits = c(0, 12000)) +
  theme_bw() +
  ylab(expression("PSMs total")) +
  xlab("") +
  theme(axis.text.x = element_text(face = "bold", size = 15, color = "black", angle = 90, vjust = 0.5, hjust = 1),
        axis.text.y = element_text(face = "bold", size = 15, color = "black"),
        axis.title.y = element_text(size = 20, face = "bold", color = "black"),
        legend.position = "none")


# Injection identifier: characters 24-26 of the PSM file name and characters
# 36-38 of the file name reported by FileInfo.
ps117_npeptides[["injection"]] <- str_sub(ps117_npeptides[["samplenames"]], start = 24, end = 26)

ps117_ms_info_df_5[["injection"]] <- str_sub(ps117_ms_info_df_5[["sample"]], start = 36, end = 38)


# Pair each PSM count with the spectra counts of the same sample and injection.
merged <- merge(
  x = ps117_npeptides,
  y = ps117_ms_info_df_5,
  by = c("sample_id", "injection")
)

# PSM count as a percentage of the level-2 spectra count.
psmpercent <- ggplot(
  merged,
  aes(x = sample_id, y = 100 * (as.numeric(peps_total) / as.numeric(ms2_spectra)))
) +
  geom_point(size = 5, alpha = 0.3) +

  #geom_point(size = 5, alpha =0.3, aes(color =method.x >63))+
  scale_colour_manual(values = c("black", "blue")) +
  scale_y_continuous(limits = c(1, 40)) +
  theme_bw() +
  labs(x = "", y = expression("PSMs %")) +
  theme(
    axis.text.x = element_text(face = "bold", size = 15, color = "black", angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(face = "bold", size = 15, color = "black"),
    axis.title.y = element_text(size = 20, face = "bold", color = "black"),
    legend.position = "none"
  )


# Stack both panels; x tick labels are shown on the lower panel only.
plot_grid(
  psmtotal + rremove("x.text"),
  psmpercent,
  nrow = 2,
  align = "v"
)
```