Skip to content

Commit c44037d

Browse files
authored
Merge pull request #39 from JLSteenwyk/efficient
Efficient
2 parents 0427f33 + 10b4db1 commit c44037d

File tree

94 files changed

+1318
-1482
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+1318
-1482
lines changed

change_log.txt

+9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
Major changes to PhyKIT are summarized here.
22

3+
1.21.0
4+
- The occupancy file output by the create_concat function has been updated
5+
- Column information in the occupancy file is as follows:
6+
- column 1: alignment name
7+
- column 2: # of taxa present
8+
- column 3: # of taxa missing
9+
- column 4: fraction of occupancy
10+
- column 5: names of missing taxa (; separated)
11+
312
1.20.0
413
- Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
514

docs/change_log/index.rst

+8
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ Change log
88

99
Major changes to PhyKIT are summarized here.
1010

11+
**1.21.0**:
12+
The occupancy file output by the create_concat function has been updated to the following format:
13+
- column 1: alignment name
14+
- column 2: # of taxa present
15+
- column 3: # of taxa missing
16+
- column 4: fraction of occupancy
17+
- column 5: names of missing taxa (; separated)
18+
1119
**1.20.0**:
1220
Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
1321

phykit/helpers/files.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from enum import Enum
22
import sys
3+
from typing import Tuple
34

45
from Bio import AlignIO
6+
from Bio.Align import MultipleSeqAlignment
57

68

79
class FileFormat(Enum):
@@ -15,12 +17,17 @@ class FileFormat(Enum):
1517
stockholm = "stockholm"
1618

1719

18-
def get_alignment_and_format(alignment_file_path: str):
19-
# if file format is provided, read the file according to the user's file format
20+
def get_alignment_and_format(
21+
alignment_file_path: str
22+
) -> Tuple[MultipleSeqAlignment, str, bool]:
23+
# if file format is provided, read the file
24+
# according to the user's file format
2025
for fileFormat in FileFormat:
2126
try:
22-
alignment = AlignIO.read(open(alignment_file_path), fileFormat.value)
23-
return alignment, fileFormat.value
27+
alignment = AlignIO.read(
28+
open(alignment_file_path), fileFormat.value
29+
)
30+
return alignment, fileFormat.value, is_protein_alignment(alignment)
2431
# the following exceptions refer to skipping over errors
2532
# associated with reading the wrong input file
2633
except ValueError:
@@ -33,6 +40,21 @@ def get_alignment_and_format(alignment_file_path: str):
3340
sys.exit()
3441

3542

43+
def is_protein_alignment(alignment: "MultipleSeqAlignment") -> bool:
    """
    Guess whether an alignment contains protein sequences.

    The check is a heuristic: every record is upper-cased and compared
    against the set of characters that can legitimately appear in a
    nucleotide alignment. The first record holding any other character
    marks the alignment as protein.

    Args:
        alignment: iterable of records exposing a ``seq`` attribute.

    Returns:
        True if at least one record contains a character outside the
        nucleotide alphabet, otherwise False (including for an empty
        alignment).
    """
    # Bases, IUPAC ambiguity codes, and gap/missing-data symbols.
    # The ambiguity codes and "." (the Stockholm gap symbol) must be listed
    # here; otherwise a DNA/RNA alignment containing e.g. "R" or "Y" is
    # misreported as protein.
    nucleotide_chars = frozenset(
        "ACGTU"          # canonical bases
        "RYSWKMBDHVN"    # IUPAC ambiguity codes
        "-.?*"           # gaps / missing data / stop
    )

    for record in alignment:
        # any leftover character cannot be a nucleotide symbol,
        # so the sequence is most likely protein
        if set(str(record.seq).upper()) - nucleotide_chars:
            return True

    return False
56+
57+
3658
def read_single_column_file_to_list(single_col_file_path: str) -> list:
3759
try:
3860
with open(single_col_file_path) as f:

phykit/phykit.py

+5
Original file line numberDiff line numberDiff line change
@@ -2563,6 +2563,11 @@ def create_concatenation_matrix(argv):
25632563
2) A partition file ready for input into RAxML or IQ-tree.
25642564
3) An occupancy file that summarizes the taxon occupancy
25652565
per sequence.
2566+
- column 1: alignment name
2567+
- column 2: # of taxa present
2568+
- column 3: # of taxa missing
2569+
- column 4: fraction of occupancy
2570+
- column 5: names of missing taxa (; separated)
25662571
25672572
Aliases:
25682573
create_concatenation_matrix, create_concat, cc
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
from .base import Alignment
22

3+
from typing import Dict
4+
35

46
class AlignmentLength(Alignment):
57
def __init__(self, args) -> None:
68
super().__init__(**self.process_args(args))
79

8-
def run(self):
9-
alignment, _ = self.get_alignment_and_format()
10+
def run(self) -> None:
11+
alignment, _, _ = self.get_alignment_and_format()
1012
aln_len = alignment.get_alignment_length()
1113
print(aln_len)
1214

13-
def process_args(self, args):
15+
def process_args(self, args) -> Dict[str, str]:
1416
return dict(alignment_file_path=args.alignment)
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,62 @@
1+
from argparse import Namespace
2+
from typing import Dict, Tuple
3+
4+
from Bio.Align import MultipleSeqAlignment
5+
16
from .base import Alignment
27

38

49
class AlignmentLengthNoGaps(Alignment):
510
def __init__(self, args) -> None:
611
super().__init__(**self.process_args(args))
712

8-
def run(self):
9-
alignment, alignment_format = self.get_alignment_and_format()
13+
def run(self) -> None:
14+
alignment, _, is_protein = self.get_alignment_and_format()
1015
(
1116
aln_len_no_gaps,
1217
aln_len,
1318
aln_len_no_gaps_per,
14-
) = self.calculate_alignment_length_no_gaps(alignment)
19+
) = self.calculate_alignment_length_no_gaps(alignment, is_protein)
1520
print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}")
1621

17-
def process_args(self, args):
22+
def process_args(
23+
self,
24+
args: Namespace,
25+
) -> Dict[str, str]:
1826
return dict(alignment_file_path=args.alignment)
1927

20-
def calculate_alignment_length_no_gaps(self, alignment):
28+
def calculate_alignment_length_no_gaps(
29+
self,
30+
alignment: MultipleSeqAlignment,
31+
is_protein: bool,
32+
) -> Tuple[int, int, float]:
2133
aln_len = alignment.get_alignment_length()
22-
aln_len_no_gaps = self.get_sites_no_gaps_count(alignment, aln_len)
34+
aln_len_no_gaps = self.get_sites_no_gaps_count(
35+
alignment,
36+
aln_len,
37+
is_protein
38+
)
2339

24-
# calculate percent of variable sites
2540
aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100
2641

2742
return aln_len_no_gaps, aln_len, aln_len_no_gaps_per
2843

29-
def get_sites_no_gaps_count(self, alignment, aln_len):
44+
def get_sites_no_gaps_count(
45+
self,
46+
alignment: MultipleSeqAlignment,
47+
aln_len: int,
48+
is_protein: bool,
49+
) -> int:
3050
"""
3151
Count sites in the alignment with no gaps
3252
"""
3353
aln_len_no_gaps = 0
34-
for i in range(0, aln_len):
35-
seq_at_position = ""
36-
seq_at_position += alignment[:, i]
37-
if "-" not in seq_at_position:
54+
55+
gap_chars = self.get_gap_chars()
56+
57+
for i in range(aln_len):
58+
column = set(alignment[:, i])
59+
if column.isdisjoint(gap_chars):
3860
aln_len_no_gaps += 1
3961

4062
return aln_len_no_gaps

phykit/services/alignment/alignment_recoding.py

+48-43
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from os import path
22
import sys
3+
from typing import Dict, List
4+
5+
from Bio.Align import MultipleSeqAlignment
36

47
from .base import Alignment
58

@@ -10,34 +13,42 @@ class AlignmentRecoding(Alignment):
1013
def __init__(self, args) -> None:
1114
super().__init__(**self.process_args(args))
1215

13-
def run(self):
14-
alignment, _ = self.get_alignment_and_format()
15-
16+
def run(self) -> None:
17+
alignment, _, is_protein = self.get_alignment_and_format()
18+
1619
recoding_table = self.read_recoding_table(self.code[0])
1720

18-
recoded_alignment = self.recode_alignment_as_dict(
19-
alignment, recoding_table
21+
recoded_alignment = self.recode_alignment(
22+
alignment, recoding_table, is_protein
2023
)
2124

2225
for k, v in recoded_alignment.items():
2326
print(f">{k}\n{''.join(v)}")
2427

25-
def recode_alignment_as_dict(self, alignment, recoding_table: dict) -> dict:
28+
def recode_alignment(
29+
self,
30+
alignment: MultipleSeqAlignment,
31+
recoding_table: Dict[str, str],
32+
is_protein: bool,
33+
) -> Dict[str, List[str]]:
34+
35+
gap_chars = self.get_gap_chars()
2636
recoded_alignment = dict()
27-
for i in range(0, len(alignment)):
28-
recoded_sequence_i = []
29-
for j in range(alignment.get_alignment_length()):
30-
sequence_ij = alignment[i, j].upper()
31-
if sequence_ij in ["?", "-", "X"]:
32-
recoded_sequence_i.append(sequence_ij)
33-
else:
34-
recoded_sequence_i.append(recoding_table[sequence_ij])
3537

36-
recoded_alignment[alignment[i].id] = recoded_sequence_i
38+
for record in alignment:
39+
recoded_sequence = [
40+
recoding_table.get(base.upper(), base)
41+
if base not in gap_chars else base
42+
for base in record.seq
43+
]
44+
recoded_alignment[record.id] = recoded_sequence
3745

3846
return recoded_alignment
3947

40-
def read_recoding_table(self, recoding: str) -> dict:
48+
def read_recoding_table(
49+
self,
50+
recoding: str
51+
) -> Dict[str, str]:
4152
"""
4253
return recoding table with original characters as keys and recoded characters as values
4354
"""
@@ -47,33 +58,27 @@ def read_recoding_table(self, recoding: str) -> dict:
4758
if recoding is None:
4859
print("Please specify a recoding table")
4960
sys.exit()
50-
elif recoding == "RY-nucleotide":
51-
pathing = path.join(here, "../../recoding_tables/RY-nucleotide.txt")
52-
elif recoding == "SandR-6":
53-
pathing = path.join(here, "../../recoding_tables/S_and_R-6.txt")
54-
elif recoding == "KGB-6":
55-
pathing = path.join(here, "../../recoding_tables/KGB-6.txt")
56-
elif recoding == "Dayhoff-6":
57-
pathing = path.join(here, "../../recoding_tables/Dayhoff-6.txt")
58-
elif recoding == "Dayhoff-9":
59-
pathing = path.join(here, "../../recoding_tables/Dayhoff-9.txt")
60-
elif recoding == "Dayhoff-12":
61-
pathing = path.join(here, "../../recoding_tables/Dayhoff-12.txt")
62-
elif recoding == "Dayhoff-15":
63-
pathing = path.join(here, "../../recoding_tables/Dayhoff-15.txt")
64-
elif recoding == "Dayhoff-18":
65-
pathing = path.join(here, "../../recoding_tables/Dayhoff-18.txt")
66-
# handling case of a custom translation table
67-
else:
68-
pathing = str(recoding)
69-
70-
with open(pathing) as code:
71-
for line in code:
72-
line = line.split()
73-
if line[1].upper() in recoding_table.keys():
74-
recoding_table[line[1]].upper().append(line[0].upper())
75-
else:
76-
recoding_table[line[1]] = line[0].upper()
61+
62+
recoding_paths = {
63+
"RY-nucleotide": "../../recoding_tables/RY-nucleotide.txt",
64+
"SandR-6": "../../recoding_tables/S_and_R-6.txt",
65+
"KGB-6": "../../recoding_tables/KGB-6.txt",
66+
"Dayhoff-6": "../../recoding_tables/Dayhoff-6.txt",
67+
"Dayhoff-9": "../../recoding_tables/Dayhoff-9.txt",
68+
"Dayhoff-12": "../../recoding_tables/Dayhoff-12.txt",
69+
"Dayhoff-15": "../../recoding_tables/Dayhoff-15.txt",
70+
"Dayhoff-18": "../../recoding_tables/Dayhoff-18.txt",
71+
}
72+
pathing = recoding_paths.get(recoding, str(recoding))
73+
74+
try:
75+
with open(path.join(here, pathing)) as code:
76+
for line in code:
77+
parts = line.split()
78+
recoding_table[parts[1].upper()] = parts[0].upper()
79+
except FileNotFoundError:
80+
print(f"Recoding table file '{pathing}' not found.")
81+
sys.exit()
7782

7883
return recoding_table
7984

phykit/services/alignment/base.py

+26-32
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
from collections import Counter
12
import sys
23

4+
from typing import List
5+
36
from ..base import BaseService
47
from ...helpers.files import (
58
get_alignment_and_format as get_alignment_and_format_helper
@@ -49,47 +52,38 @@ def get_alignment_and_format(self):
4952
print("Please double check pathing and filenames")
5053
sys.exit()
5154

52-
def calculate_rcv(self):
53-
alignment, _ = self.get_alignment_and_format()
55+
def calculate_rcv(self) -> float:
56+
alignment, _, _ = self.get_alignment_and_format()
5457
aln_len = alignment.get_alignment_length()
5558

56-
# string to hold all sequences
57-
concat_seq = ""
58-
# initialize a counter for the number of sequences in the input fasta file
59-
num_records = 0
59+
concat_seq = []
60+
num_records = len(alignment)
6061

61-
# for each record join concatSeq string and sequence as well as keeping track
62-
# of the number of records
63-
for record in alignment:
64-
concat_seq += record.seq
65-
num_records += 1
62+
concat_seq = "".join(str(record.seq) for record in alignment)
6663

67-
# dictionary to hold the average occurence of each sequence letter
68-
average_d = {}
69-
# loop through the different sequences that appear in the fasta file
70-
# population dictionary with how many times that sequence appears
71-
for seq in set(concat_seq):
72-
average_d[seq] = concat_seq.count(seq) / num_records
64+
total_counts = Counter(concat_seq)
65+
66+
average_d = {
67+
seq: total_counts[seq] / num_records for seq in total_counts
68+
}
7369

74-
# intiailize list to hold the RCV values per ith taxa
75-
# that will later be summed
7670
indiv_rcv_values = []
7771

78-
# loop through records again and calculate RCV for
79-
# each taxa and append to indivRCVvalues
8072
for record in alignment:
81-
# temp holds a temporary value of the numerator before appending
82-
# to numeratorRCVvalues and then is reassigned to 0 when it goes
83-
# through the loop again
84-
temp = 0
85-
# calculates the absolute value of the ith sequence letter minus the average
86-
for seq_letter in set(concat_seq):
87-
temp += abs(
88-
record.seq.count(seq_letter) - average_d[seq_letter]
89-
)
90-
indiv_rcv_values.append(temp / (num_records * aln_len))
73+
record_counts = Counter(record.seq)
74+
temp_rcv = sum(
75+
abs(
76+
record_counts[seq_letter] - average_d[seq_letter]
77+
) for seq_letter in total_counts
78+
)
79+
indiv_rcv_values.append(temp_rcv / (num_records * aln_len))
9180

9281
relative_composition_variability = sum(indiv_rcv_values)
9382

94-
# print the sum of all RCV values
9583
return relative_composition_variability
84+
85+
def get_gap_chars(self, is_protein: bool = True) -> List[str]:
    """
    Return the characters treated as gaps / missing data.

    Callers invoke this as a bound method (``self.get_gap_chars()``), so
    the signature needs ``self``. Without it the instance itself is bound
    to ``is_protein``, is always truthy, and the nucleotide branch can
    never run.

    Args:
        is_protein: True for protein alignments, False for nucleotide
            alignments. Defaults to True so that existing no-argument
            calls keep returning the protein gap characters; pass the
            flag explicitly to get nucleotide handling.

    Returns:
        A new list of gap characters. For nucleotide alignments the
        ambiguous base "N"/"n" is included as well.
    """
    gap_chars = ["-", "?", "*", "X", "x"]
    if not is_protein:
        # "N" is a real residue (asparagine) in proteins, so it only
        # counts as missing data for nucleotide alignments
        gap_chars += ["N", "n"]
    return gap_chars

0 commit comments

Comments
 (0)