Skip to content
This repository has been archived by the owner on Jan 26, 2022. It is now read-only.

Water attempt 2 #405

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions MSMBuilder/assigning.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tables
import warnings
from mdtraj import io
from msmbuilder import MSMLib as msmlib
import logging
logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -240,3 +241,53 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa
"-- this function is deprecated"), DeprecationWarning)
assign_with_checkpoint(metric, project, generators, assignments_path,
distances_path, chunk_size, atom_indices_to_load)


def outer_product_assignment(assign1, assign2):
"""Combine the results of two clusterings.

Specifically, if a conformation is in state i after clustering
with metric 1, and it is in state j after clustering with metric 2,
we assign it to a new state (ij)

This is a naive way of combining two metrics to give a singular
assignment of states.

Parameters
----------
assign1 : np.ndarray
Assignment array 1
assign2 : np.ndarray
Assignment array 2

Returns
----------
new_assign : np.ndarray
New assignments array, renumbered from 0 ... N-1, N <= n1 * n2
"""

assert assign1.shape == assign2.shape, """Assignments must be
for the same set of trajectories."""
new_assign = -1 * np.ones_like(assign1, dtype=np.int)

nstates1 = np.max(assign1) + 1
nstates2 = np.max(assign2) + 1

translations = np.reshape(np.arange(nstates1 * nstates2),
(nstates1, nstates2))

ass_shape = assign1.shape
for i in xrange(ass_shape[0]):
for j in xrange(ass_shape[1]):
# TODO: vectorize this loop
if assign1[i, j] == -1:
# No assignment here
assert assign2[i, j] == -1, """Assignments must be for
the same set of trajectories."""
else:
new_assign[i, j] = translations[assign1[i, j], assign2[i, j]]

# Renumber states in place
msmlib.renumber_states(new_assign)
return new_assign

1 change: 1 addition & 0 deletions MSMBuilder/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@
from .hybrid import Hybrid, HybridPNorm
from .projection import RedDimPNorm
from .positions import Positions
from .solvent import SolventFp
36 changes: 29 additions & 7 deletions MSMBuilder/metrics/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact,
AtomPairs, ContinuousContact,
AbstractDistanceMetric, Vectorized,
RedDimPNorm, Positions)
RedDimPNorm, Positions, SolventFp)


def add_argument(group, *args, **kwargs):
Expand All @@ -22,8 +22,6 @@ def add_argument(group, *args, **kwargs):
kwargs['help'] = d
group.add_argument(*args, **kwargs)

################################################################################


def locate_metric_plugins(name):
if not name in ['add_metric_parser', 'construct_metric', 'metric_class']:
Expand All @@ -39,7 +37,7 @@ def add_metric_parsers(parser):

metric_subparser = parser.add_subparsers(dest='metric', description='Available metrics to use.')

#metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
# metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
rmsd = metric_subparser.add_parser('rmsd',
description='''RMSD: Root mean square deviation over a set of user defined atoms
(typically backbone heavy atoms or alpha carbons). To evaluate the distance
Expand Down Expand Up @@ -80,7 +78,7 @@ def add_metric_parsers(parser):
Furthermore, the sense in which the distance between two residues is computed can be
either specified as "CA", "closest", or "closest-heavy", which will respectively compute
("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of
atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"),
atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"),
or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to
one residue and j to the other residue. This code is executed in parallel on multiple cores (but
not multiple boxes) using OMP.''')
Expand All @@ -106,6 +104,21 @@ def add_metric_parsers(parser):
help='which distance metric', choices=AtomPairs.allowable_scipy_metrics)
metric_parser_list.append(atompairs)

solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame
calculate a 'solvent fingerprint' by considering the weighted pairwise distances
between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""")
add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat',
help='Indices of the solvent (water) atoms')
add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat',
help='Indices of the solute (protein) atoms')
add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5,
type=float, help='std. dev. of gaussian kernel')
add_argument(solventfp, '-p', dest='solventfp_p', default=2,
help='p used for metric=minkowski (otherwise ignored)')
add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean',
help='which distance metric', choices=SolventFp.allowable_scipy_metrics)
metric_parser_list.append(solventfp)

positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame
we represent the conformation as a vector of atom positions, where the atoms have been
aligned to a target structure.""")
Expand All @@ -121,10 +134,10 @@ def add_metric_parsers(parser):
help='which distance metric', choices=Positions.allowable_scipy_metrics)
metric_parser_list.append(positions)

tica = metric_subparser.add_parser( 'tica', description='''
tica = metric_subparser.add_parser('tica', description='''
tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f.
in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.)
for more details. In addition to these options, you must provide an additional
for more details. In addition to these options, you must provide an additional
metric you used to prepare the trajectories in the training step.''')

add_argument(tica, '-p', dest='p', help='p value for p-norm')
Expand Down Expand Up @@ -213,6 +226,15 @@ def construct_metric(args):
metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices,
metric=args.positions_metric, p=args.positions_p)

elif metric_name == 'solventfp':
proti = np.loadtxt(args.solventfp_proti, np.int)
wateri = np.loadtxt(args.solventfp_wateri, np.int)
metric = SolventFp(solute_indices=proti,
solvent_indices=wateri,
sigma=args.solventfp_sigma,
metric=args.solventfp_metric,
p=args.solventfp_p)

elif metric_name == "tica":
tica_obj = tICA.load(args.tica_fn)

Expand Down
106 changes: 106 additions & 0 deletions MSMBuilder/metrics/solvent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import print_function, absolute_import, division

import logging

from .baseclasses import Vectorized
import mdtraj as md
import numpy as np


logger = logging.getLogger(__name__)


class SolventFp(Vectorized):
"""Distance metric for calculating distances between frames based on their
solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.
"""

allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev',
'cityblock', 'correlation', 'cosine',
'euclidean', 'minkowski', 'sqeuclidean',
'seuclidean', 'mahalanobis', 'sqmahalanobis']

def __init__(self, solute_indices, solvent_indices, sigma,
metric='euclidean', p=2, V=None, VI=None):
"""Create a distance metric to capture solvent degrees of freedom

Parameters
----------
solute_indices : ndarray
atom indices of the solute atoms
solvent_indices : ndarray
atom indices of the solvent atoms
sigma : float
width of gaussian kernel
metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock',
'correlation', 'cosine', 'euclidean', 'minkowski',
'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'}
Distance metric to equip the vector space with.
p : int, optional
p-norm order, used for metric='minkowski'
V : ndarray, optional
variances, used for metric='seuclidean'
VI : ndarray, optional
inverse covariance matrix, used for metric='mahalanobi'

"""

# Check input indices
md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1,
name='solute', can_be_none=False)
md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1,
name='solvent', can_be_none=False)

super(SolventFp, self).__init__(metric, p, V, VI)
self.solute_indices = solute_indices
self.solvent_indices = solvent_indices
self.sigma = sigma

def __repr__(self):
"String representation of the object"
return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)'
% (self.metric, self.p, self.sigma))

def prepare_trajectory(self, trajectory):
"""Calculate solvent fingerprints
Parameters
----------
trajectory : msmbuilder.Trajectory
An MSMBuilder trajectory to prepare

Returns
-------
fingerprints : ndarray
A 2D array of fingerprint vectors of
shape (traj_length, protein_atom)
"""

# Give shorter names to these things
prot_indices = self.solute_indices
water_indices = self.solvent_indices
sigma = self.sigma

# The result vector
fingerprints = np.zeros((trajectory.n_frames, len(prot_indices)))

# Check for periodic information
if trajectory.unitcell_lengths is None:
logging.warn('No periodic information found for computing solventfp.')

for i, prot_i in enumerate(prot_indices):
# For each protein atom, calculate distance to all water
# molecules
atom_pairs = np.empty((len(water_indices), 2))
atom_pairs[:, 0] = prot_i
atom_pairs[:, 1] = water_indices
# Get a traj_length x n_water_indices vector of distances
distances = md.compute_distances(trajectory,
atom_pairs,
periodic=True)
# Calculate guassian kernel
distances = np.exp(-distances / (2 * sigma * sigma))

# Sum over water atoms for all frames
fingerprints[:, i] = np.sum(distances, axis=1)

return fingerprints
69 changes: 69 additions & 0 deletions scripts/AssignOuter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python
# This file is part of MSMBuilder.
#
# Copyright 2014 Stanford University
#
# MSMBuilder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


#===============================================================================
# Imports
#===============================================================================

import logging

from mdtraj import io

from msmbuilder import arglib, assigning
from msmbuilder.metrics import solvent

#===============================================================================
# Globals
#===============================================================================

logger = logging.getLogger('msmbuilder.scripts.AssignOuter')
parser = arglib.ArgumentParser(description=
"""Combine the results of two clusterings. Specifically, if a conformation
is in state i after clustering with metric 1, and it is in state j
after clustering with metric 2, we assign it to a new state (ij)

This is a naive way of combining two metrics to give a singular
assignment of states.""")
parser.add_argument('assignment1', default='./Data1/Assignments.h5',
help='First assignment file')
parser.add_argument('assignment2', default='./Data2/Assignments.h5',
help='Second assignment file')
parser.add_argument('assignment_out', default='OuterProductAssignments.h5',
help='Output file')


def run(assign1_fn, assign2_fn, out_fn):
assign1 = io.loadh(assign1_fn, 'arr_0')
assign2 = io.loadh(assign2_fn, 'arr_0')

new_assignments = assigning.outer_product_assignment(assign1, assign2)
io.saveh(out_fn, new_assignments)
logger.info('Saved outer product assignments to %s', out_fn)


def entry_point():
args = parser.parse_args()
arglib.die_if_path_exists(args.assignment_out)

run(args.assignment1, args.assignment2, args.assignment_out)


if __name__ == "__main__":
entry_point()
Loading