msmbuilder · mpharrigan · Apr 4, 2014 · Apr 4, 2014 · Apr 4, 2014 · Apr 4, 2014
diff --git a/MSMBuilder/assigning.py b/MSMBuilder/assigning.py
@@ -7,6 +7,7 @@
 import tables
 import warnings
 from mdtraj import io
+from msmbuilder import MSMLib as msmlib
 import logging
 logger = logging.getLogger(__name__)
 
@@ -240,3 +241,53 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa
                    "-- this function is deprecated"), DeprecationWarning)
     assign_with_checkpoint(metric, project, generators, assignments_path,
                            distances_path, chunk_size, atom_indices_to_load)
+
+
+def outer_product_assignment(assign1, assign2):
+    """Combine the results of two clusterings.
+
+    Specifically, if a conformation is in state i after clustering
+    with metric 1, and it is in state j after clustering with metric 2,
+    we assign it to a new state (ij)
+
+    This is a naive way of combining two metrics to give a singular
+    assignment of states.
+
+    Parameters
+    ----------
+    assign1 : np.ndarray
+        Assignment array 1
+    assign2 : np.ndarray
+        Assignment array 2
+
+    Returns
+    ----------
+    new_assign : np.ndarray
+        New assignments array, renumbered from 0 ... N-1, N <= n1 * n2
+    """
+
+    assert assign1.shape == assign2.shape, """Assignments must be
+        for the same set of trajectories."""
+    new_assign = -1 * np.ones_like(assign1, dtype=np.int)
+
+    nstates1 = np.max(assign1) + 1
+    nstates2 = np.max(assign2) + 1
+
+    translations = np.reshape(np.arange(nstates1 * nstates2),
+                              (nstates1, nstates2))
+
+    ass_shape = assign1.shape
+    for i in xrange(ass_shape[0]):
+        for j in xrange(ass_shape[1]):
+            # TODO: vectorize this loop
+            if assign1[i, j] == -1:
+                # No assignment here
+                assert assign2[i, j] == -1, """Assignments must be for
+                    the same set of trajectories."""
+            else:
+                new_assign[i, j] = translations[assign1[i, j], assign2[i, j]]
+
+    # Renumber states in place
+    msmlib.renumber_states(new_assign)
+    return new_assign
+
diff --git a/MSMBuilder/metrics/__init__.py b/MSMBuilder/metrics/__init__.py
@@ -19,3 +19,4 @@
 from .hybrid import Hybrid, HybridPNorm
 from .projection import RedDimPNorm
 from .positions import Positions
+from .solvent import SolventFp
diff --git a/MSMBuilder/metrics/parsers.py b/MSMBuilder/metrics/parsers.py
@@ -10,7 +10,7 @@
 from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact,
                                 AtomPairs, ContinuousContact,
                                 AbstractDistanceMetric, Vectorized,
-                                RedDimPNorm, Positions)
+                                RedDimPNorm, Positions, SolventFp)
 
 
 def add_argument(group, *args, **kwargs):
@@ -22,8 +22,6 @@ def add_argument(group, *args, **kwargs):
             kwargs['help'] = d
     group.add_argument(*args, **kwargs)
 
-################################################################################
-
 
 def locate_metric_plugins(name):
     if not name in ['add_metric_parser', 'construct_metric', 'metric_class']:
@@ -39,7 +37,7 @@ def add_metric_parsers(parser):
 
     metric_subparser = parser.add_subparsers(dest='metric', description='Available metrics to use.')
 
-    #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
+    # metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
     rmsd = metric_subparser.add_parser('rmsd',
                                        description='''RMSD: Root mean square deviation over a set of user defined atoms
         (typically backbone heavy atoms or alpha carbons). To evaluate the distance
@@ -80,7 +78,7 @@ def add_metric_parsers(parser):
         Furthermore, the sense in which the distance between two residues is computed can be
         either specified as "CA", "closest", or "closest-heavy", which will respectively compute
         ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of
-        atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), 
+        atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"),
         or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to
         one residue and j to the other residue. This code is executed in parallel on multiple cores (but
         not multiple boxes) using OMP.''')
@@ -106,6 +104,21 @@ def add_metric_parsers(parser):
                  help='which distance metric', choices=AtomPairs.allowable_scipy_metrics)
     metric_parser_list.append(atompairs)
 
+    solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame
+        calculate a 'solvent fingerprint' by considering the weighted pairwise distances
+        between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""")
+    add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat',
+                help='Indices of the solvent (water) atoms')
+    add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat',
+                help='Indices of the solute (protein) atoms')
+    add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5,
+                 type=float, help='std. dev. of gaussian kernel')
+    add_argument(solventfp, '-p', dest='solventfp_p', default=2,
+                 help='p used for metric=minkowski (otherwise ignored)')
+    add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean',
+        help='which distance metric', choices=SolventFp.allowable_scipy_metrics)
+    metric_parser_list.append(solventfp)
+
     positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame
         we represent the conformation as a vector of atom positions, where the atoms have been
         aligned to a target structure.""")
@@ -121,10 +134,10 @@ def add_metric_parsers(parser):
                  help='which distance metric', choices=Positions.allowable_scipy_metrics)
     metric_parser_list.append(positions)
 
-    tica = metric_subparser.add_parser( 'tica', description='''
+    tica = metric_subparser.add_parser('tica', description='''
         tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f.
         in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.)
-        for more details. In addition to these options, you must provide an additional 
+        for more details. In addition to these options, you must provide an additional
         metric you used to prepare the trajectories in the training step.''')
 
     add_argument(tica, '-p', dest='p', help='p value for p-norm')
@@ -213,6 +226,15 @@ def construct_metric(args):
         metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices,
                            metric=args.positions_metric, p=args.positions_p)
 
+    elif metric_name == 'solventfp':
+        proti = np.loadtxt(args.solventfp_proti, np.int)
+        wateri = np.loadtxt(args.solventfp_wateri, np.int)
+        metric = SolventFp(solute_indices=proti,
+                           solvent_indices=wateri,
+                           sigma=args.solventfp_sigma,
+                           metric=args.solventfp_metric,
+                           p=args.solventfp_p)
+
     elif metric_name == "tica":
         tica_obj = tICA.load(args.tica_fn)
 

diff --git a/MSMBuilder/metrics/solvent.py b/MSMBuilder/metrics/solvent.py
@@ -0,0 +1,106 @@
+from __future__ import print_function, absolute_import, division
+
+import logging
+
+from .baseclasses import Vectorized
+import mdtraj as md
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+
+class SolventFp(Vectorized):
+    """Distance metric for calculating distances between frames based on their
+    solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.
+    """
+
+    allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev',
+                               'cityblock', 'correlation', 'cosine',
+                               'euclidean', 'minkowski', 'sqeuclidean',
+                               'seuclidean', 'mahalanobis', 'sqmahalanobis']
+
+    def __init__(self, solute_indices, solvent_indices, sigma,
+                 metric='euclidean', p=2, V=None, VI=None):
+        """Create a distance metric to capture solvent degrees of freedom
+
+        Parameters
+        ----------
+        solute_indices : ndarray
+            atom indices of the solute atoms
+        solvent_indices : ndarray
+            atom indices of the solvent atoms
+        sigma : float
+                width of gaussian kernel
+        metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock',
+                  'correlation', 'cosine', 'euclidean', 'minkowski',
+                  'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'}
+            Distance metric to equip the vector space with.
+        p : int, optional
+            p-norm order, used for metric='minkowski'
+        V : ndarray, optional
+            variances, used for metric='seuclidean'
+        VI : ndarray, optional
+            inverse covariance matrix, used for metric='mahalanobi'
+
+        """
+
+        # Check input indices
+        md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1,
+                             name='solute', can_be_none=False)
+        md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1,
+                             name='solvent', can_be_none=False)
+
+        super(SolventFp, self).__init__(metric, p, V, VI)
+        self.solute_indices = solute_indices
+        self.solvent_indices = solvent_indices
+        self.sigma = sigma
+
+    def __repr__(self):
+        "String representation of the object"
+        return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)'
+                % (self.metric, self.p, self.sigma))
+
+    def prepare_trajectory(self, trajectory):
+        """Calculate solvent fingerprints
+        Parameters
+        ----------
+        trajectory : msmbuilder.Trajectory
+            An MSMBuilder trajectory to prepare
+
+        Returns
+        -------
+        fingerprints : ndarray
+            A 2D array of fingerprint vectors of
+            shape (traj_length, protein_atom)
+        """
+
+        # Give shorter names to these things
+        prot_indices = self.solute_indices
+        water_indices = self.solvent_indices
+        sigma = self.sigma
+
+        # The result vector
+        fingerprints = np.zeros((trajectory.n_frames, len(prot_indices)))
+
+        # Check for periodic information
+        if trajectory.unitcell_lengths is None:
+            logging.warn('No periodic information found for computing solventfp.')
+
+        for i, prot_i in enumerate(prot_indices):
+            # For each protein atom, calculate distance to all water
+            # molecules
+            atom_pairs = np.empty((len(water_indices), 2))
+            atom_pairs[:, 0] = prot_i
+            atom_pairs[:, 1] = water_indices
+            # Get a traj_length x n_water_indices vector of distances
+            distances = md.compute_distances(trajectory,
+                                             atom_pairs,
+                                             periodic=True)
+            # Calculate guassian kernel
+            distances = np.exp(-distances / (2 * sigma * sigma))
+
+            # Sum over water atoms for all frames
+            fingerprints[:, i] = np.sum(distances, axis=1)
+
+        return fingerprints
diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# This file is part of MSMBuilder.
+#
+# Copyright 2014 Stanford University
+#
+# MSMBuilder is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+#===============================================================================
+# Imports
+#===============================================================================
+
+import logging
+
+from mdtraj import io
+
+from msmbuilder import arglib, assigning
+from msmbuilder.metrics import solvent
+
+#===============================================================================
+# Globals
+#===============================================================================
+
+logger = logging.getLogger('msmbuilder.scripts.AssignOuter')
+parser = arglib.ArgumentParser(description=
+    """Combine the results of two clusterings. Specifically, if a conformation
+    is in state i after clustering with metric 1, and it is in state j
+    after clustering with metric 2, we assign it to a new state (ij)
+
+    This is a naive way of combining two metrics to give a singular
+    assignment of states.""")
+parser.add_argument('assignment1', default='./Data1/Assignments.h5',
+                    help='First assignment file')
+parser.add_argument('assignment2', default='./Data2/Assignments.h5',
+                    help='Second assignment file')
+parser.add_argument('assignment_out', default='OuterProductAssignments.h5',
+                    help='Output file')
+
+
+def run(assign1_fn, assign2_fn, out_fn):
+    assign1 = io.loadh(assign1_fn, 'arr_0')
+    assign2 = io.loadh(assign2_fn, 'arr_0')
+
+    new_assignments = assigning.outer_product_assignment(assign1, assign2)
+    io.saveh(out_fn, new_assignments)
+    logger.info('Saved outer product assignments to %s', out_fn)
+
+
+def entry_point():
+    args = parser.parse_args()
+    arglib.die_if_path_exists(args.assignment_out)
+
+    run(args.assignment1, args.assignment2, args.assignment_out)
+
+
+if __name__ == "__main__":
+    entry_point()