Skip to content

Commit

Permalink
Merge pull request #7 from mkandziora/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
mkandziora authored Nov 25, 2020
2 parents cf961d7 + 58b67db commit a69c304
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 19 deletions.
40 changes: 34 additions & 6 deletions PhylUp/phyl_up.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def __init__(self, id_to_spn, aln, aln_schema, tre, tre_schema, config, ignore_a
self.aln = DnaCharacterMatrix.get(path=self.aln_fn, schema=self.aln_schema)
self.tre_fn = tre
self.tre_schema = tre_schema
self.tre = None
if not self.tre_fn == None:
self.tre = Tree.get(path=os.path.abspath(self.tre_fn), schema=self.tre_schema,
taxon_namespace=self.aln.taxon_namespace, preserve_underscores=True)
self.ignore_acc_list = ignore_acc_list
sys.stdout.write('Translate input names to ncbi taxonomy...\n')
self.table = phylogenetic_helpers.build_table_from_file(id_to_spn, self.config, self.config.downtorank)
Expand Down Expand Up @@ -850,7 +852,8 @@ def prefer_taxa_from_locus(self, ns_txid_df):
for tax_id in preferred_taxa_ids:
# if tax_id_from_df is lower taxon from preferred ids:
part_tax_id_from_df = self.ncbi_parser.match_id_to_mrca(tax_id_from_df, tax_id)
if part_tax_id_from_df != 0:
# if part_tax_id_from_df != 0:
if part_tax_id_from_df is True:
new_filtered_taxid_df = ns_txid_df
else:
new_filtered_taxid_df = ns_txid_df[0:0]
Expand Down Expand Up @@ -1156,7 +1159,8 @@ def filter(self, new_seqs):
if type(self.mrca) == int:
# TODO: make self.mrca to be a set - single id is type int
print("DO I EVER GET HERE - MRCA IS INT")
if mrca_tx == self.mrca:
#if mrca_tx == self.mrca:
if mrca_tx == True:
to_add = new_seqs[select_tf]
assert len(to_add) != 0, len(to_add)
self.upd_new_seqs = pd.concat([self.upd_new_seqs, to_add], axis=0, ignore_index=True, sort=True)
Expand All @@ -1166,8 +1170,9 @@ def filter(self, new_seqs):
to_del.at[:, 'status'] = 'deleted - mrca'
self.del_table = pd.concat([self.del_table, to_del], axis=0, ignore_index=True, sort=True)
elif type(self.mrca) == set:
# print(mrca_tx in self.mrca, mrca_tx, self.mrca)
if mrca_tx in self.mrca:
#print(mrca_tx in self.mrca, mrca_tx, self.mrca)
# if mrca_tx in self.mrca:
if mrca_tx == True:
to_add = new_seqs[select_tf]
assert len(to_add) != 0, len(to_add)
self.upd_new_seqs = pd.concat([self.upd_new_seqs, to_add], axis=0, ignore_index=True, sort=True)
Expand Down Expand Up @@ -1227,7 +1232,12 @@ def __init__(self, config, table):
def filter(self, new_seqs):
debug('FilterUniqueAcc')
# delete things in table
new_seqs_unique = new_seqs.drop_duplicates(subset=['accession'], keep='first') #todo: keep longest - might affect length filter
new_seqs_unique = drop_shortest_among_duplicates(new_seqs)
#new_seqs_unique_old = new_seqs.drop_duplicates(subset=['accession'], keep='first') #todo: keep longest - might affect length filter
#assert new_seqs_unique_old['sseq'].isin(new_seqs_unique['sseq']).all()
#assert new_seqs_unique_old['accession'].isin(new_seqs_unique['accession']).all()
#assert new_seqs_unique.sseq == new_seqs_unique_new.sseq

drop = new_seqs.drop(new_seqs_unique.index.tolist())

self.upd_new_seqs = new_seqs_unique
Expand All @@ -1240,6 +1250,24 @@ def filter(self, new_seqs):
write_msg_logfile(msg, self.config.workdir)


def drop_shortest_among_duplicates(new_seqs):
    """
    Keep, for each accession, the entry with the longest sequence.

    Fixes two side effects of the earlier implementation: the input frame is
    no longer mutated (it used to gain a 'seq_len' column in place), and no
    helper column leaks into the returned frame. Ties keep the first row.

    :param new_seqs: DataFrame with at least 'accession' and 'sseq' columns
    :return: DataFrame with exactly one row per unique accession
    """
    # sequence lengths computed as a standalone Series - nothing is written
    # back into the caller's DataFrame
    seq_len = new_seqs['sseq'].apply(len)

    # index label of the longest sequence within each accession group
    maximums_idx = seq_len.groupby(new_seqs['accession']).idxmax()
    maximums = new_seqs.loc[maximums_idx]

    # sanity check: exactly one row kept per distinct accession
    assert len(maximums) == new_seqs['accession'].nunique(), \
        (len(maximums), new_seqs['accession'].nunique())
    return maximums


def check_df_index_unique(df):
    """Fail (showing the offending index) if *df* has duplicated index labels."""
    idx = df.index
    assert idx.is_unique, idx

Expand Down
4 changes: 4 additions & 0 deletions install_req.bash
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,8 @@ echo "export PATH=$PWD:\$PATH" >> ~/.bashrctest
cd ..
cd ..

# Fetch the NCBI taxonomy dump and extract only the two tables used locally
# (names.dmp / nodes.dmp), then move them into the project's data directory.
wget 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
gunzip -cd taxdump.tar.gz | (tar xvf - names.dmp nodes.dmp)
mv *.dmp ./data/

source ~/.bashrc
1 change: 1 addition & 0 deletions tests/test_filter_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,4 @@ def test_filterlen():

assert before > after
assert del_tab > 0
assert before == after + del_tab
39 changes: 38 additions & 1 deletion tests/test_filter_otu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import pandas as pd
import pytest

from distutils.dir_util import copy_tree
Expand Down Expand Up @@ -36,22 +37,58 @@ def configure():
pytest.test = test

new_seqs = test.extend()
assert len(new_seqs) > 0, len(new_seqs)

aln = phylogenetic_helpers.read_in_aln(test.aln_fn, test.aln_schema)

new_seqs = new_seqs[~new_seqs['accession'].isin(test.table['accession'])] # ~ is the pd not in/!
new_seqs = test.basic_filters(aln, test.mrca, new_seqs)
assert len(new_seqs) > 0, len(new_seqs)

#new_seqs = test.basic_filters(aln, test.mrca, new_seqs)
orig_len = new_seqs
columns = ['ncbi_txn', 'ncbi_txid', 'status', 'status_note', "date",
'accession', 'pident', 'evalue', 'bitscore', 'sseq', 'title']
all_del = pd.DataFrame(columns=columns)

remove_basics = [phyl_up.FilterUniqueAcc(test.config, test.table),
# FilterBLASTThreshold(self.config),
phyl_up.FilterLength(test.config, aln),
phyl_up.FilterMRCA(test.config, test.mrca)
]
for f in remove_basics:
print(f)
internal_new_seq = new_seqs
f.filter(new_seqs)
new_seqs = f.upd_new_seqs
del_seq = f.del_table
# all_del = all_del.append(del_seq, ignore_index=True)
all_del = pd.concat([all_del, del_seq], ignore_index=True, sort=True)
phyl_up.check_filter_numbers(del_seq, new_seqs, internal_new_seq)
assert len(new_seqs) > 0, len(new_seqs)

if len(new_seqs) == 0:
return new_seqs # stop filtering if new seqs is 0
phyl_up.check_filter_numbers(all_del, new_seqs, orig_len)
assert len(new_seqs) > 0, len(new_seqs)

# next filter need infos in table
new_seqs = test.add_new_seqs(new_seqs)
pytest.new_seqs = new_seqs

assert len(pytest.new_seqs) > 0, len(pytest.new_seqs)


def test_filter_otu_no_rank():
test = pytest.test
print(test.table)
new_seqs = pytest.new_seqs
assert len(new_seqs) > 0, len(new_seqs)

conf = pytest.conf


before = len(new_seqs)
print(before)
new_seqs_org = new_seqs
before_table = deepcopy(test.table)

Expand Down
87 changes: 75 additions & 12 deletions tests/test_filter_seqident.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ def test_filter_compare():
conf = config.ConfigObj(configfi, workdir, interactive=False)
conf.threshold = 2
conf.blast_folder = os.path.abspath("./data/blast_for_tests")
conf.identical_seqs = False

if not os.path.exists(workdir):
os.rename(workdir)
Expand All @@ -229,36 +230,62 @@ def test_filter_compare():

test = phyl_up.PhylogeneticUpdater(id_to_spn, seqaln, mattype, trfn, schema_trf, conf)

new_seqs = test.extend()
new_existing_seq = """AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTCCGATTCCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGTGGTGACTGCATTGAAACTTGCTTCTCTATAATTAATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAACCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACACATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGAAGATTGGTCTCCTGTTCCTAAGGTGCGGTTGGCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC"""

test.table.loc[1, 'sseq'] = new_existing_seq
test.table.loc[1, 'ncbi_txid'] = 123456
test.table.loc[1, 'status'] = 0

assert test.table['sseq'].str.contains(new_existing_seq).any() == True

new_existing_seq = """TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTCCGATTCCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGTGGTGACTGCATTGAAACTTGCTTCTCTATAATTAATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAACCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACACATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGAAGATTGGTCTCCTGTTCCTAAGGTGCGGTTGGCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC"""
new_seqs = test.extend()

new_seqs = new_seqs[~new_seqs['accession'].isin(test.table['accession'])] # ~ is the pd not in/!


f = phyl_up.FilterUniqueAcc(test.config, test.table)
f.filter(new_seqs)
new_seqs = f.upd_new_seqs
new_seqs.loc[1, 'sseq'] = new_existing_seq
new_seqs.loc[1, 'ncbi_txid'] = 123456
assert new_seqs.loc[1, 'sseq'] == new_existing_seq
assert new_seqs['sseq'].str.contains(new_existing_seq).any()
assert test.table['sseq'].str.contains(new_existing_seq).any()
assert new_seqs['sseq'].str.contains(new_existing_seq).any() == True
assert test.table['sseq'].str.contains(new_existing_seq).any() == True
new_seqs.loc[1, 'ncbi_txid'] = 123456
new_seq_acc = new_seqs.loc[1, 'accession']


new_seqs_before = new_seqs

len_table_before = len(test.table)
new_seqs = test.add_new_seqs(new_seqs)
assert new_seqs.index.tolist() != new_seqs_before.index.tolist()
# assert new_existing_seq in [new_seqs['sseq']]
assert new_seqs['sseq'].str.contains(new_existing_seq).any() == True
len_table_after = len(test.table)
assert len_table_before < len_table_after, (len_table_before, len_table_after)

assert test.table['sseq'].hasnans == False, test.table['sseq']

assert new_seqs['sseq'].str.contains(new_existing_seq).any() == True
assert new_seqs['accession'].str.contains(new_seq_acc).any() == True
assert test.table['sseq'].str.contains(new_existing_seq).any() == True
print(new_seqs.index[new_seqs['sseq'].str.contains(new_existing_seq).any()])

# new_seqs = test.compare_filter(new_seqs)
f = phyl_up.FilterSeqIdent(test.config, test.table, test.status)
before_filter = new_seqs
f.filter(new_seqs)
test.aln = f.aln
new_seqs = f.upd_new_seqs # assign new_seqs to last upd from filter before
test.table = f.table
del_seq = f.del_table
print(new_seq_acc)

new_seqs = test.compare_filter(new_seqs)
print(test.table['sseq'].str.contains(new_existing_seq).any())
print(new_seqs['sseq'].str.contains(new_existing_seq).any())
print(test.table.loc[1, ])
assert new_seqs['accession'].str.contains(new_seq_acc).any() == False

assert new_seqs['sseq'].str.contains(new_existing_seq).any() == False

all_avail_data = test.table[test.table['status'].between(0, test.status, inclusive=True)]
Expand All @@ -269,8 +296,7 @@ def test_filter_compare():
print(found.count())
assert found.count() == 1, found.count()



# TODO check: seems to be testing the same as above, but differently, and fails now
def test_filter_compare_shorter():
print('test_filter_seqident')
workdir = "tests/output/test_run_seqident"
Expand All @@ -284,11 +310,32 @@ def test_filter_compare_shorter():
conf = config.ConfigObj(configfi, workdir, interactive=False)
conf.threshold = 2
conf.blast_folder = os.path.abspath("./data/blast_for_tests")

if not os.path.exists(workdir):
os.rename(workdir)
if not os.path.exists(workdir):
os.mkdir(workdir)
tmp_folder = os.path.join(workdir, 'tmp')
if not os.path.exists(tmp_folder):
os.mkdir(tmp_folder)
# call(['cp', '-a', 'data/tmp_for_test/', tmp_folder])
copy_tree('data/tmp_for_test/', tmp_folder)
shutil.copyfile('data/tiny_test_example/updt_aln.fasta', os.path.join(workdir, 'updt_aln.fasta'))
shutil.copyfile('data/tiny_test_example/updt_tre.tre', os.path.join(workdir, 'updt_tre.tre'))

test = phyl_up.PhylogeneticUpdater(id_to_spn, seqaln, mattype, trfn, schema_trf, conf)

new_existing_seq = """AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTCCGATTCCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGTGGTGACTGCATTGAAACTTGCTTCTCTATAATTAATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAACCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACACATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGAAGATTGGTCTCCTGTTCCTAAGGTGCGGTTGGCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC"""

test.table.loc[1, 'sseq'] = new_existing_seq
test.table.loc[1, 'ncbi_txid'] = 123456
test.table.loc[1, 'status'] = 0

assert test.table['sseq'].str.contains(new_existing_seq).any() == True

new_seqs = test.extend()

new_existing_seq = """AGAACGACCTGTGAACATGTAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTCCGATTCCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGTGGTGACTGCATTGAAACTTGCTTCTCTATAATTAATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAACCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACACATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGAAGATTGGTCTCCTGTTCCTAAGGTGCGGTTGGCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC"""
# new_existing_seq = """AGAACGACCTGTGAACATGTAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTCCGATTCCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGTGGTGACTGCATTGAAACTTGCTTCTCTATAATTAATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAACCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACACATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGAAGATTGGTCTCCTGTTCCTAAGGTGCGGTTGGCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC"""

new_seqs = new_seqs[~new_seqs['accession'].isin(test.table['accession'])] # ~ is the pd not in/!

Expand All @@ -297,6 +344,11 @@ def test_filter_compare_shorter():
new_seqs = f.upd_new_seqs
new_seqs.loc[1, 'sseq'] = new_existing_seq
assert new_seqs.loc[1, 'sseq'] == new_existing_seq
assert new_seqs['sseq'].str.contains(new_existing_seq).any() == True
assert test.table['sseq'].str.contains(new_existing_seq).any() == True
new_seqs.loc[1, 'ncbi_txid'] = 123456
new_seq_acc = new_seqs.loc[1, 'accession']
assert new_seqs.loc[1, 'sseq'] == new_existing_seq
assert new_seqs['sseq'].str.contains(new_existing_seq).any()
assert test.table['sseq'].str.contains(new_existing_seq).any()

Expand All @@ -311,10 +363,21 @@ def test_filter_compare_shorter():

assert test.table['sseq'].hasnans == False, test.table['sseq']

new_seqs = test.compare_filter(new_seqs)
print(new_seqs['sseq'])
assert new_seqs['sseq'].str.contains(new_existing_seq).any() == True

# new_seqs = test.compare_filter(new_seqs)
f = phyl_up.FilterSeqIdent(test.config, test.table, test.status)
before_filter = new_seqs
f.filter(new_seqs)
test.aln = f.aln
new_seqs = f.upd_new_seqs # assign new_seqs to last upd from filter before
test.table = f.table
del_seq = f.del_table
print(new_seq_acc)

print(new_existing_seq)
print(test.table['sseq'].str.contains(new_existing_seq).any())
print(new_seqs['sseq'].str.contains(new_existing_seq).any())
assert new_seqs['accession'].str.contains(new_seq_acc).any() == False
assert new_seqs['sseq'].str.contains(new_existing_seq).any() == False

all_avail_data = test.table[test.table['status'].between(0, test.status, inclusive=True)]
Expand Down
51 changes: 51 additions & 0 deletions tests/test_filter_unique.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import pandas as pd
from distutils.dir_util import copy_tree

from PhylUp import phyl_up, config
Expand Down Expand Up @@ -39,3 +40,53 @@ def test_filter_unique():
assert before > after
assert del_tab > 0
assert after + del_tab == before

def test_filters_to_longest():
    """
    Check that FilterUniqueAcc keeps the longest sequence per accession.

    Builds an independent longest-per-accession selection by hand, then runs
    the PhylUp filter and asserts both selections contain the same accessions.
    """
    print('test_filter_unique')
    workdir = "tests/output/test_runs_unique"
    trfn = "data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = "data/tiny_test_example/test_nicespl.csv"
    seqaln = "data/tiny_test_example/test.fas"
    mattype = "fasta"
    configfi = "data/localblast_test.config"

    if not os.path.exists(workdir):
        os.mkdir(workdir)
    tmp_folder = os.path.join(workdir, 'tmp')
    if not os.path.exists(tmp_folder):
        os.mkdir(tmp_folder)
    # call(['cp', '-a', 'data/tmp_for_test/', tmp_folder])
    copy_tree('data/tmp_for_test/', tmp_folder)

    conf = config.ConfigObj(configfi, workdir, interactive=False)
    conf.threshold = 2
    conf.blast_folder = os.path.abspath("./data/blast_for_tests")
    test = phyl_up.PhylogeneticUpdater(id_to_spn, seqaln, mattype, trfn, schema_trf, conf)

    new_seqs = test.extend()

    # make own longest selection: one representative row per accession
    accs = new_seqs.drop_duplicates(subset=['accession'], keep='first')

    len_supposed = len(accs)

    select = pd.DataFrame(columns=['ncbi_txn', 'ncbi_txid', 'status', 'status_note', "date", 'accession',
                                   'pident', 'evalue', 'bitscore', 'sseq', 'title'])
    for acc in accs.accession:
        acc_df = new_seqs[new_seqs.accession == acc]
        lenseq_acc_df = acc_df['sseq'].apply(len)
        idxmax_val = lenseq_acc_df.idxmax()
        # DataFrame.append was removed in pandas 2.0 - use pd.concat,
        # consistent with the rest of the codebase
        select = pd.concat([select, acc_df.loc[[idxmax_val]]], sort=True)

    assert len(select) == len_supposed, (len(select), len_supposed)

    # run filter from phylup
    before = len(new_seqs)

    f = phyl_up.FilterUniqueAcc(test.config, test.table)
    f.filter(new_seqs)
    new_seqs = f.upd_new_seqs

    assert new_seqs.accession.isin(select.accession).all()

Loading

0 comments on commit a69c304

Please sign in to comment.