-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_dotplot.py
executable file
·136 lines (124 loc) · 7.26 KB
/
make_dotplot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
#Copyright © 2020 Giulia I. Corsi
import argparse as ap
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap
import seaborn as sb
import os
import numpy as np
sb.set_style("whitegrid")
PALETTE = {'start':0.2, 'end':0.8} # used to smooth the palette and avoid very dark or very light colors that are at the edges of palettes
def parse_args():
parser = ap.ArgumentParser()
parser.add_argument('-e', '--enrichment_results', help='Path to table with enrichment results, in csv format ',
type=str, required=True)
parser.add_argument('-o', help='Path to output folder', type=str, required = True)
parser.add_argument('--GSEA', help='Source of the GSEA analysis', choices=['StringApp', 'Other'],
default='StringApp')
parser.add_argument('-c_n', help='Name of the column containing the number of background genes', type=str, required=False)
parser.add_argument('-c_nb', help='Name of the column containing the number of genes assigned to each term',
type=str, required=False)
parser.add_argument('-c_c', help='Name of the column containing a category, used to group enriched terms', type=str, required=False)
parser.add_argument('-c_d', help='Name of the column containing the term description', type=str, required=False)
parser.add_argument('-c_f', help='Name of the column containing the FDR value', type=str, required=False)
parser.add_argument('-n', help='Maximum number of terms to plot', type=int, default=20)
parser.add_argument('--ratio_min',
help='Consider only terms for which the gene-ratio term is above the given threshold',
type=float, default=0.00)
parser.add_argument('--palette', help='Color palette used', default='viridis')
parser.add_argument('--n_bins', help = 'Number of bins for grouping gene counts', default=4, type=int, choices = [2,3,4])
parser.add_argument('--groups', help = 'List of categories to be grouped together', default=['KEGG Pathways','Reactome Pathways'], nargs='+')
parser.add_argument('--increase_height', help = 'Integer, increases the height of the plot', type = int, default=0)
parser.add_argument('--increase_width', help = 'Integer, increases the width of the plot', type = int, default=0)
args = parser.parse_args()
if args.GSEA == 'Other':
if not args.c_n or not args.c_nb or not args.c_d: # c_c is optional
parser.error(
'If the GSEA source is unknown, names of the columns to be used in the table have to be provided (args.c_*)')
if args.GSEA == 'StringApp':
args.c_n = '# genes'
args.c_nb = '# background genes'
args.c_d = 'description'
args.c_c = 'category'
args.c_f = 'FDR value'
if not os.path.exists(args.o):
os.mkdir(args.o)
return args
def parse_data(path_enrichment: str, c_n: str, c_nb: str, ratio_min:float, c_f: str, c_d: str, c_c: str):
df = pd.read_csv(path_enrichment)
df['ratio'] = df[c_n] / df[c_nb]
if ratio_min:
df = df[df['ratio'] > ratio_min]
df = df[[c_n, c_f, c_d, c_c, 'ratio']]
df.columns = ['Gene Count', 'FDR', 'Description', 'Category', 'Gene Ratio']
return df
def dot_plot_enrichment(df: pd.DataFrame, title: str, out_folder: str, c_d: str, c_r: str, c_n: str, n_max: int,
c_f: str, palette:str, n_bins:int, height_add:int, width_add :int):
def get_exponent(x):
str_x = '%.2E' % x
str_x = str_x[str_x.find('E'):]
return str_x
def get_color_bins(ser: pd.Series):
min_exp = int(get_exponent(ser.min())[1:])
max_exp = int(get_exponent(ser.max())[1:])
exponents = ['E%.2i'%x for x in range(min_exp, max_exp+1,1) ]
color_bins = {x: id for id, x in enumerate(exponents)}
return color_bins
df.reindex()
df.sort_values(c_f)
df = df.iloc[:n_max].copy()
SIZES = {0:20,1:120,2:250,3:400}
width_xfigure = width_add+5+df[c_d].apply(lambda x: len(x)).max()/15
height_yfigure = height_add+len(df)/3.5
plt.figure(figsize=(width_xfigure, height_yfigure))
df['count_bins'] = (pd.cut(df[c_n], min(n_bins, len(df[c_n].unique()))).apply(lambda x: x.right)).astype(int)
if len(df)>=5: #len(df['count_bins'].unique()) == n_bins and
size_bins = {x: SIZES[i] for i,x in enumerate(sorted(df['count_bins'].unique()))}
df['count_bins_sizes'] = df['count_bins'].apply(lambda x:size_bins[x])
print(df)
print(size_bins)
color_bins = get_color_bins(df[c_f])
df[c_f] = df[c_f].apply(lambda x: get_exponent(x))
#color_bins = {x:id for id, x in enumerate(df[c_f].unique())}
df['color_bins']=df[c_f].apply(lambda x: color_bins[x])
newcmp = ListedColormap(cm.get_cmap(palette, 256)(np.linspace(PALETTE['start'], PALETTE['end'], 256)))
dot_plot = plt.scatter(x=df[c_r], y=range(len(df)), s=df['count_bins_sizes'], c=df['color_bins'], cmap=newcmp)
plt.gca().invert_yaxis()
plt.yticks(range(len(df[c_d])),df[c_d], fontsize=11)
markers=[]
lst_size_bins_counts = [df[c_n].min()]
lst_size_bins_counts.extend([x for x in sorted(size_bins.keys())])
for i, bin in enumerate(sorted(size_bins.keys())):
markers.append(plt.scatter([], [], c='grey', s=size_bins[bin], label=str(lst_size_bins_counts[i])+'-'+str(bin)))
plt.legend(title = 'Gene counts', handles=markers,labelspacing=1, borderpad=1, fontsize=11)
cbar = plt.colorbar(dot_plot, ticks = [0,len(color_bins)-1])
cbar.ax.set_xlabel(c_f)
cbar.ax.invert_yaxis()
c_f_range=df.sort_values(c_f)[c_f].unique()
cbar.set_ticklabels([c_f_range[-1],c_f_range[0]])
plt.xlabel('Gene Ratio', fontsize=11)
plt.tight_layout()
plt.savefig(os.path.join(out_folder,'%s_enrichment.svg'%title), format='svg')
else:
print('Figure for category %s is not produced because the number of terms is smaller than the number of gene count bins or because the number of terms is <5'%title)
if __name__ == '__main__':
args = parse_args()
df = parse_data(path_enrichment=args.enrichment_results, c_n=args.c_n, c_nb=args.c_nb, ratio_min=args.ratio_min,
c_f=args.c_f, c_d=args.c_d, c_c=args.c_c)
if args.c_c:
print(args.groups)
if args.groups:
category = '-'.join(args.groups)
print(category)
df['Category']=df['Category'].apply(lambda x: category if x in args.groups else x)
for category, df_category in df.groupby('Category'):
print(category)
dot_plot_enrichment(df=df_category.copy(), title=category.replace(' ','_'), out_folder=args.o, c_d='Description',
c_r='Gene Ratio', c_n='Gene Count', n_max=args.n, c_f='FDR', palette=args.palette, n_bins=args.n_bins, height_add=args.increase_height, width_add = args.increase_width)
else:
dot_plot_enrichment(df=df, title='All', out_folder=args.o, c_d='Description', c_r='Gene Ratio',
c_n='Gene Count', n_max=args.n, c_f='FDR', palette=args.palette, n_bins= args.n_bins, height_add=args.increase_height, width_add = args.increase_width)