johanzi · Jul 26, 2018
diff --git a/‎.Rhistory
+256 b/‎.Rhistory
+256
diff --git a/‎assoc2qqman.py
+47 b/‎assoc2qqman.py
+47
@@ -0,0 +1,256 @@
+# Export one phenotype column of the GWAS table to a headerless .tsv file.
+# Directory containing the file with phenotypes (must end with "/")
+dir_gwas <- "T:/dep_coupland/grp_hancock/johan/GWAS/"
+# Name of the file to import
+file_name <- "test_export_df.txt"
+# Name of the phenotype indicated in the dataframe (see Lists of phenotypes below)
+phenotype <- "HX"
+# Name chosen for the output file
+name_phenotype <- "HX"
+# List of packages required for this analysis
+pkg <- c("ggplot2")
+# Names of the required packages that are not installed yet. Compare against
+# the package names only: installed.packages() returns a character matrix and
+# matching against all of its cells could hit unrelated fields.
+new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
+# If there are any packages in the list that aren't installed,
+# install them
+if (length(new.pkg) > 0) {
+  install.packages(new.pkg, repos = "http://cran.rstudio.com")
+}
+# Plotting library
+library(ggplot2)
+# fun_nucleus.R is located in same directory as this script
+source("fun_nucleus.R")
+file_path <- paste0(dir_gwas, file_name)
+df_gwas <- read.table(file_path, header = TRUE, sep = "\t")
+# Inspect the imported table; the viewer is only opened in interactive sessions
+if (interactive()) {
+  View(df_gwas)
+}
+names(df_gwas)
+# Note that df_gwas$phenotype cannot be used as $ does not take variables.
+# Fail early with a clear message if the requested column is absent.
+if (!(phenotype %in% names(df_gwas))) {
+  stop("Column '", phenotype, "' not found in ", file_path, call. = FALSE)
+}
+variable <- df_gwas[[phenotype]]
+name_file <- paste0(name_phenotype, ".tsv")
+path_file <- paste0(dir_gwas, name_file)
+write.table(variable, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Export one phenotype column of the GWAS table to a headerless .tsv file.
+# Directory containing the file with phenotypes (must end with "/")
+dir_gwas <- "T:/dep_coupland/grp_hancock/johan/GWAS/"
+# Name of the file to import. The file is generated by process_chromatinJ_output.Rmd
+file_name <- "test_export_df.txt"
+# Name of the phenotype indicated in the dataframe (see Lists of phenotypes below)
+phenotype <- "Area"
+# Name chosen for the output file
+name_phenotype <- "Area"
+# List of packages required for this analysis
+pkg <- c("ggplot2")
+# Names of the required packages that are not installed yet. Compare against
+# the package names only: installed.packages() returns a character matrix and
+# matching against all of its cells could hit unrelated fields.
+new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
+# If there are any packages in the list that aren't installed,
+# install them
+if (length(new.pkg) > 0) {
+  install.packages(new.pkg, repos = "http://cran.rstudio.com")
+}
+# Plotting library
+library(ggplot2)
+# fun_nucleus.R is located in same directory as this script
+source("fun_nucleus.R")
+file_path <- paste0(dir_gwas, file_name)
+df_gwas <- read.table(file_path, header = TRUE, sep = "\t")
+names(df_gwas)
+# Note that df_gwas$phenotype cannot be used as $ does not take variables.
+# Fail early with a clear message if the requested column is absent.
+if (!(phenotype %in% names(df_gwas))) {
+  stop("Column '", phenotype, "' not found in ", file_path, call. = FALSE)
+}
+variable <- df_gwas[[phenotype]]
+name_file <- paste0(name_phenotype, ".tsv")
+path_file <- paste0(dir_gwas, name_file)
+write.table(variable, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Interactive session fragment: scale the phenotype vector, export it, look at
+# its distribution, then repeat the exports. It relies on `variable`,
+# `name_phenotype`, `dir_gwas`, `phenotype` and `df_gwas` already existing in
+# the workspace.
+# Normalize data: scale() centres and scales the values with its defaults
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Boxplot of the phenotype per accession; ggplot.boxplot is not a ggplot2
+# function (presumably defined in fun_nucleus.R -- confirm)
+ggplot.boxplot(df_gwas, "Accession", variable, name_phenotype)+labs(x="Accession",y=name_phenotype)
+# Inspect the distribution of the raw phenotype values (histogram)
+hist(variable)
+# Check normality of the raw phenotype values with a normal Q-Q plot
+qqnorm(variable)
+qqline(variable)
+# Normalize data (repeat of the scaling/export above; rewrites the same file)
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Set the output name explicitly before exporting once more
+name_phenotype <- "Area"
+# Normalize data
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Note that df_gwas$phenotype cannot be used as $ does not take variables
+variable <- df_gwas[,phenotype]
+name_file <- paste(name_phenotype,".tsv", sep="")
+# From here on path_file points to the unscaled output file again
+path_file <- paste(dir_gwas, name_file, sep="")
+write.table(variable, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Interactive experiments with paste(): `sep` is inserted between the separate
+# arguments, `collapse` joins the elements of the result into one string.
+a <- c("something", "to", "paste")
+a <- c("Something", "to", "paste")
+# With a single vector argument `sep` has nothing to separate, so the
+# three elements come back unchanged
+paste(a, sep="_")
+paste(a, sep="_")
+# `collapse` joins the three elements into one string
+paste(a, collapse="_")
+#normalize data
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+# NOTE: with two separate arguments `collapse` does not act as the separator;
+# the default sep = " " is used, so the name (and the file written below)
+# contains a space instead of "_". Corrected further down with sep="_".
+name_phenotype_scaled <- paste(name_phenotype, "scaled", collapse="_")
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+paste(a, collapse="_")
+# Two length-one strings joined with sep: the pattern needed for the file name
+a <- "Scaled"
+b <- "Area"
+paste(a,b,sep="_")
+# Still built from the name_phenotype_scaled value computed with collapse above
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+name_file_scaled
+# Corrected construction of the scaled name using sep="_"
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_phenotype_scaled
+variable_scaled <- scale(variable)
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_phenotype_scaled
+# Re-run the scaling and export with the corrected name
+#normalize data
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep="_")
+name_file_scaled <- paste(name_phenotype_scaled,".tsv", sep="")
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Print the resulting values to check them
+name_file_scaled
+path_file <- paste(dir_gwas, name_file_scaled, sep="")
+path_file
+variable_scaled
+path_file
+# Full run for one phenotype: import the table, export the raw and the scaled
+# phenotype as headerless .tsv files, then plot its distribution.
+# Directory containing the file with phenotypes (must end with "/")
+dir_gwas <- "T:/dep_coupland/grp_hancock/johan/GWAS/"
+# Name of the file to import. The file is generated by process_chromatinJ_output.Rmd
+file_name <- "test_export_df.txt"
+# Name of the phenotype indicated in the dataframe (see Lists of phenotypes below)
+phenotype <- "Area"
+# Name chosen for the output file
+name_phenotype <- "Area"
+# List of packages required for this analysis
+pkg <- c("ggplot2")
+# Names of the required packages that are not installed yet. Compare against
+# the package names only: installed.packages() returns a character matrix and
+# matching against all of its cells could hit unrelated fields.
+new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
+# If there are any packages in the list that aren't installed,
+# install them
+if (length(new.pkg) > 0) {
+  install.packages(new.pkg, repos = "http://cran.rstudio.com")
+}
+# Plotting library
+library(ggplot2)
+# fun_nucleus.R is located in same directory as this script
+source("fun_nucleus.R")
+file_path <- paste0(dir_gwas, file_name)
+df_gwas <- read.table(file_path, header = TRUE, sep = "\t")
+names(df_gwas)
+# Note that df_gwas$phenotype cannot be used as $ does not take variables.
+# Fail early with a clear message if the requested column is absent.
+if (!(phenotype %in% names(df_gwas))) {
+  stop("Column '", phenotype, "' not found in ", file_path, call. = FALSE)
+}
+variable <- df_gwas[[phenotype]]
+name_file <- paste0(name_phenotype, ".tsv")
+path_file <- paste0(dir_gwas, name_file)
+write.table(variable, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Normalize data: scale() centres and scales the values with its defaults
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep = "_")
+name_file_scaled <- paste0(name_phenotype_scaled, ".tsv")
+# path_file now points to the scaled output file
+path_file <- paste0(dir_gwas, name_file_scaled)
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Boxplot of the phenotype per accession; ggplot.boxplot is not a ggplot2
+# function (presumably defined in fun_nucleus.R -- confirm)
+ggplot.boxplot(df_gwas, "Accession", variable, name_phenotype) +
+  labs(x = "Accession", y = name_phenotype)
+# Inspect the distribution of the raw phenotype values (histogram)
+hist(variable)
+# Check normality of the raw phenotype values with a normal Q-Q plot
+qqnorm(variable)
+qqline(variable)
+# Full run for one phenotype: import the table, export the raw and the scaled
+# phenotype as headerless .tsv files, then plot its distribution.
+# Directory containing the file with phenotypes (must end with "/")
+dir_gwas <- "T:/dep_coupland/grp_hancock/johan/GWAS/"
+# Name of the file to import. The file is generated by process_chromatinJ_output.Rmd
+file_name <- "test_export_df.txt"
+# Name of the phenotype indicated in the dataframe (see Lists of phenotypes below)
+phenotype <- "Area"
+# Name chosen for the output file
+name_phenotype <- "Area_nucleus"
+# List of packages required for this analysis
+pkg <- c("ggplot2")
+# Names of the required packages that are not installed yet. Compare against
+# the package names only: installed.packages() returns a character matrix and
+# matching against all of its cells could hit unrelated fields.
+new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
+# If there are any packages in the list that aren't installed,
+# install them
+if (length(new.pkg) > 0) {
+  install.packages(new.pkg, repos = "http://cran.rstudio.com")
+}
+# Plotting library
+library(ggplot2)
+# fun_nucleus.R is located in same directory as this script
+source("fun_nucleus.R")
+file_path <- paste0(dir_gwas, file_name)
+df_gwas <- read.table(file_path, header = TRUE, sep = "\t")
+names(df_gwas)
+# Note that df_gwas$phenotype cannot be used as $ does not take variables.
+# Fail early with a clear message if the requested column is absent.
+if (!(phenotype %in% names(df_gwas))) {
+  stop("Column '", phenotype, "' not found in ", file_path, call. = FALSE)
+}
+variable <- df_gwas[[phenotype]]
+name_file <- paste0(name_phenotype, ".tsv")
+path_file <- paste0(dir_gwas, name_file)
+write.table(variable, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Normalize data: scale() centres and scales the values with its defaults
+variable_scaled <- scale(variable)
+# Add automatically the suffix "_scaled" when the variable is scaled
+name_phenotype_scaled <- paste(name_phenotype, "scaled", sep = "_")
+name_file_scaled <- paste0(name_phenotype_scaled, ".tsv")
+# path_file now points to the scaled output file
+path_file <- paste0(dir_gwas, name_file_scaled)
+write.table(variable_scaled, row.names = FALSE, col.names = FALSE, quote = FALSE, file = path_file, sep = "\t")
+# Boxplot of the phenotype per accession; ggplot.boxplot is not a ggplot2
+# function (presumably defined in fun_nucleus.R -- confirm)
+ggplot.boxplot(df_gwas, "Accession", variable, name_phenotype) +
+  labs(x = "Accession", y = name_phenotype)
+# Inspect the distribution of the raw phenotype values (histogram)
+hist(variable)
+# Check normality of the raw phenotype values with a normal Q-Q plot
+qqnorm(variable)
+qqline(variable)
+# Inspect GWAS association results: Q-Q plot, per-chromosome overview,
+# Manhattan plot and the table of SNPs below the P-value threshold.
+knitr::opts_chunk$set(message=TRUE, warning=FALSE)
+# List of packages required for this analysis
+pkg <- c("qqman", "ggplot2")
+# Names of the required packages that are not installed yet. Compare against
+# the package names only: installed.packages() returns a character matrix and
+# matching against all of its cells could hit unrelated fields.
+new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
+# If there are any packages in the list that aren't installed,
+# install them
+if (length(new.pkg) > 0) {
+  install.packages(new.pkg, repos = "http://cran.rstudio.com")
+}
+# Library to plot Manhattan plots
+library(qqman)
+library(ggplot2)
+# Directory of the association output (must end with "/") and file to read
+dir_file <- "T:/dep_coupland/grp_hancock/johan/GWAS/output/"
+file.name <- "Area_nucleus.assoc.clean.txt"
+path.file <- paste0(dir_file, file.name)
+# The code below reads the columns SNP, CHR and P from this table
+gwas.results <- read.delim(path.file, sep = "\t")
+qq(gwas.results$P, main = file.name)
+# Use -log10 so these plots share the scale of the threshold below
+plot(-log10(gwas.results$P) ~ gwas.results$CHR, main = file.name)
+hist(-log10(gwas.results$P), main = file.name)
+# Number of SNPs per chromosome
+as.data.frame(table(gwas.results$CHR))
+# Get the SNPs having -log10(P) > 5, i.e. P < 10^-5
+threshold <- 10^-5
+gwas_significant <- subset(gwas.results, P < threshold)
+# Get a vector of the SNPs with significant value
+SNP_significant <- as.vector(gwas_significant$SNP)
+manhattan(gwas.results, highlight = SNP_significant, main = file.name)
+# Show the significant SNPs only if there are any (dataframe not empty).
+# print() is explicit so the table is also shown when the code is source()d.
+if (nrow(gwas_significant) > 0) {
+  print(gwas_significant)
+}
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import sys
+import os
+import getopt
+import re
+
+'''
+        AUTHOR: Johan Zicola
+
+	USAGE: assoc2qqman.py <prefix.assoc.txt>
+    Reads the GEMMA output file 'prefix.assoc.txt' produced
+    by the association analysis command and reformats it
+    so that it can be analysed in R with the package 'qqman'.
+'''
+
+
+def main():
+
+    # Open the file
+    input_file = sys.argv[1]
+    input_file = open(input_file, "r")
+    lines_file = input_file.read().splitlines()
+
+
+    for line in lines_file:
+        if line[:3] == "chr":  #First line should be header
+            header = "SNP\tCHR\tBP\tP\tzscore"
+            print header
+        else:
+            line = line.strip().split("\t")  # Get rid of EOL and Create a list based on \t separation
+            SNP = line[2]  # third column
+            chr_name = line[1].split(":")
+            #Keep only lines matching chromosomes (exclude Mt, Pt, ...)
+            if chr_name[0][0:3] == "Chr":
+                CHR = chr_name[0].replace("Chr", "")
+                BP = line[2]
+                P = line[8]
+                zscore = line[7]
+                new_line = SNP, CHR, BP, P, zscore
+                print "\t".join(new_line)
+    input_file.close()
+
+if __name__ == "__main__":
+    sys.exit(main())
+