-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess_raw_biom_matrix.conseq
106 lines (96 loc) · 4.64 KB
/
preprocess_raw_biom_matrix.conseq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# process_expr_matrix
#
# Consumes an artifact of type raw-expr-matrix and publishes a biomarker-matrix
# artifact whose file is out-c.hdf5. The published artifact carries over the
# dataset id (as source_dataset_id) and the category of the input artifact.
#
# Inputs:
#   in             - the raw expression matrix artifact (dataset_id, category)
#   script         - scripts/process_expr_matrix.py, run in the first step
#   hdf5_utils     - scripts/hdf5_utils.py, made available as hdf5_utils.py;
#                    not referenced by the commands below, presumably imported
#                    by one of the scripts (TODO confirm)
#   cleanup_script - scripts/cleanup_hdf5.py, run in the second step
#   meta           - cell line metadata artifact passed to the cleanup step
rule process_expr_matrix:
    inputs:
        in={"type": "raw-expr-matrix"},
        script=fileref("scripts/process_expr_matrix.py"),
        hdf5_utils=fileref("scripts/hdf5_utils.py", copy_to="hdf5_utils.py"),
        cleanup_script=fileref("scripts/cleanup_hdf5.py"),
        meta={"type": "cell_line_metadata"}
    outputs: {
        "type": "biomarker-matrix",
        "source_dataset_id": "{{ inputs.in.dataset_id }}",
        "category": "{{inputs.in.category}}",
        "filename": {"$filename": "out-c.hdf5"}
    }
    # Step 1: the processing script receives the script directory, the dataset
    # id and the name of the intermediate file (out.hdf5).
    run "python3 {{inputs.script.filename}} {{ config.SCRIPT_DIR }} {{ inputs.in.dataset_id }} out.hdf5"
    # Step 2: the cleanup script reads out.hdf5 and writes out-c.hdf5, using the
    # metadata file given with -m.
    run "python3 {{inputs.cleanup_script.filename}} out.hdf5 out-c.hdf5 --add-arxspan-to-cols -m {{inputs.meta.filename}}"
# download_biomarker_as_hdf5
#
# Consumes an artifact of type biomarker-correctly-transposed, fetches its
# dataset through the taiga client, writes it to <category>.hdf5 with
# hdf5_utils.write_hdf5, and then runs the cleanup script to produce
# <category>-c.hdf5, which is published as a biomarker-matrix artifact.
#
# Inputs:
#   in             - artifact providing dataset_id and category
#   meta           - cell line metadata artifact passed to the cleanup step
#   hdf5_utils     - scripts/hdf5_utils.py, made available as hdf5_utils.py so
#                    the inline script can import it
#   cleanup_script - scripts/cleanup_hdf5.py, run in the second step
rule download_biomarker_as_hdf5:
    inputs:
        in={"type": "biomarker-correctly-transposed"},
        meta={'type': 'cell_line_metadata'},
        hdf5_utils=fileref('scripts/hdf5_utils.py', copy_to='hdf5_utils.py'),
        cleanup_script=fileref("scripts/cleanup_hdf5.py")
    outputs: {
        "type": "biomarker-matrix",
        "source_dataset_id" : "{{ inputs.in.dataset_id }}",
        "category": "{{ inputs.in.category }}",
        "filename": {"$filename": "{{ inputs.in.category }}-c.hdf5"}
    }
    # Step 1: download the dataset and store it as <category>.hdf5.
    # Uses python3 (not the bare python command) so that this step runs under
    # the same interpreter as the cleanup step below and as the equivalent
    # inline script in transpose_biomarker.
    run "python3" with """
import hdf5_utils
from taigapy import create_taiga_client_v3
tc = create_taiga_client_v3()
x = tc.get("{{ inputs.in.dataset_id }}")
hdf5_utils.write_hdf5(x, "{{inputs.in.category}}.hdf5")
"""
    # Step 2: the cleanup script reads <category>.hdf5 and writes
    # <category>-c.hdf5, using the metadata file given with -m.
    run "python3 {{inputs.cleanup_script.filename}} {{inputs.in.category}}.hdf5 {{inputs.in.category}}-c.hdf5 --add-arxspan-to-cols -m {{inputs.meta.filename}}"
# transpose_biomarker
#
# Consumes an artifact of type biomarker-needing-transpose, fetches its dataset
# through the taiga client, transposes it, writes the result to
# <category>.hdf5, and then runs the cleanup script to produce
# <category>-c.hdf5, which is published as a biomarker-matrix artifact.
#
# Inputs:
#   in             - artifact providing dataset_id and category
#   cleanup_script - scripts/cleanup_hdf5.py, run in the second step
#   hdf5_utils     - scripts/hdf5_utils.py, made available as hdf5_utils.py so
#                    the inline script can import write_hdf5 from it
#   meta           - cell line metadata artifact passed to the cleanup step
rule transpose_biomarker:
    inputs:
        in={"type": "biomarker-needing-transpose"},
        cleanup_script=fileref("scripts/cleanup_hdf5.py"),
        hdf5_utils=fileref("scripts/hdf5_utils.py", copy_to="hdf5_utils.py"),
        meta={"type": "cell_line_metadata"}
    outputs: {
        "type": "biomarker-matrix",
        "source_dataset_id": "{{ inputs.in.dataset_id }}",
        "category": "{{ inputs.in.category }}",
        "filename": {"$filename": "{{inputs.in.category}}-c.hdf5"}
    }
    # Step 1: download, transpose and store the matrix as <category>.hdf5.
    run "python3" with """
from hdf5_utils import write_hdf5
from taigapy import create_taiga_client_v3
import pandas as pd
tc = create_taiga_client_v3()
data = tc.get("{{inputs.in.dataset_id}}")
transposed_data = data.transpose()
write_hdf5(transposed_data, "{{inputs.in.category}}.hdf5")
"""
    # Step 2: clean up the intermediate file into <category>-c.hdf5.
    # The --add-arxspan-to-cols option appears to work fine even when the
    # columns are already arxspan ids.
    run "python3 {{inputs.cleanup_script.filename}} {{inputs.in.category}}.hdf5 {{inputs.in.category}}-c.hdf5 --add-arxspan-to-cols -m {{inputs.meta.filename}}"
# process_maf_file
#
# Consumes an artifact of type mutation-maf, fetches its dataset through the
# taiga client and writes it to maf.csv without the index column. The file is
# published as a mutation-maf-file artifact that records the input dataset id
# as orig_dataset_id.
#
# NOTE(review): the meta input is declared but never referenced by the run
# statement. It still affects when the rule can execute, so it is kept as is;
# confirm whether that dependency is intentional.
rule process_maf_file:
    inputs:
        in={"type": "mutation-maf"},
        meta={"type": "cell_line_metadata"}
    outputs: {
        "type": "mutation-maf-file",
        "filename": {"$filename": "maf.csv"},
        "orig_dataset_id": "{{inputs.in.dataset_id}}"
    }
    # Download the MAF table and dump it as CSV.
    run "python3" with """
from taigapy import create_taiga_client_v3
import pandas as pd
tc = create_taiga_client_v3()
data = tc.get("{{inputs.in.dataset_id}}")
data.to_csv("maf.csv", index=False)
"""
# process_mutations_matrices
#
# Consumes an artifact of type raw-mutations-bool-matrix together with an
# hgnc-snapshot artifact. The download script is given both dataset ids and the
# name out.csv; hdf5_utils.py is then invoked as a command line tool (to_hdf5)
# to convert out.csv into out.hdf5, which is published as a biomarker-matrix
# artifact whose category is the input category prefixed with mutations-.
#
# Inputs:
#   mutation_bool - artifact providing dataset_id and category
#   hgnc          - artifact providing the HGNC dataset_id
#   script        - scripts/download_mutation_boolean_matrix.py
#   hdf5_script   - scripts/hdf5_utils.py, used here as an executable script
rule process_mutations_matrices:
    inputs:
        mutation_bool={"type": "raw-mutations-bool-matrix"},
        hgnc={"type": "hgnc-snapshot"},
        script=fileref("scripts/download_mutation_boolean_matrix.py"),
        hdf5_script=fileref("scripts/hdf5_utils.py")
    outputs:
        {
            "type": "biomarker-matrix",
            "source_dataset_id": "{{ inputs.mutation_bool.dataset_id }}",
            "category": "mutations-{{ inputs.mutation_bool.category }}",
            "filename": { "$filename": "out.hdf5" }
        }
    # Step 1: build out.csv from the two datasets.
    # python3 is used instead of the bare python command so that every rule in
    # this file runs its scripts under the same interpreter.
    run """python3 {{ inputs.script.filename }} {{ inputs.mutation_bool.dataset_id }} \
{{ inputs.hgnc.dataset_id }} out.csv"""
    # Step 2: convert the CSV into the HDF5 file that is published.
    run "python3 {{ inputs.hdf5_script.filename }} to_hdf5 out.csv csv out.hdf5"
# create_fusions_matrix
#
# Consumes the other-taiga-dataset artifact whose category is fusions together
# with an hgnc-snapshot artifact. The fusions script is given both dataset ids
# and the name out.csv; hdf5_utils.py is then invoked as a command line tool
# (to_hdf5) to convert out.csv into fusions.hdf5, which is published as a
# biomarker-matrix artifact with category fusions.
#
# Inputs:
#   fusions     - artifact providing the fusions dataset_id
#   script      - scripts/make_fusions_matrix.py
#   hgnc        - artifact providing the HGNC dataset_id
#   hdf5_script - scripts/hdf5_utils.py, used here as an executable script
rule create_fusions_matrix:
    inputs: fusions={'type': 'other-taiga-dataset',
                     'category': 'fusions'},
            script=fileref("scripts/make_fusions_matrix.py"),
            hgnc={"type": "hgnc-snapshot"},
            hdf5_script=fileref("scripts/hdf5_utils.py")
    outputs: {'type': 'biomarker-matrix',
              'category': 'fusions',
              'filename': {"$filename": "fusions.hdf5"},
              'source_dataset_id': '{{inputs.fusions.dataset_id}}'}
    # Step 1: build out.csv from the two datasets.
    # python3 is used instead of the bare python command so that every rule in
    # this file runs its scripts under the same interpreter.
    run """python3 {{ inputs.script.filename }} {{ inputs.fusions.dataset_id }} \
{{ inputs.hgnc.dataset_id }} out.csv"""
    # Step 2: convert the CSV into the HDF5 file that is published.
    run "python3 {{ inputs.hdf5_script.filename }} to_hdf5 out.csv csv fusions.hdf5"