Source code for stacker.file_manipulation

"""Filter Trajectory Files

This module contains functions for manipulating trajectory
files. This includes filtering a trajectory to desired atoms,
converting trajectory filetype, and outputting Python trajectories
to other filetypes (eg. prmtop, mdcrd, pdb)
"""

import mdtraj as md
from numpy import typing
import argparse


[docs]
def filter_traj(trj_file : str, top_file : str, 
                residues : str | set = {}, atoms : set = {}) -> md.Trajectory:
    '''
    Filters an input trajectory to only the specified atoms and residues

    Filteres an input trajectory that contains all of the atoms in a topology to only
    the desired atoms at the desired residues (eg. the atoms necessary to find the 
    center of geometry of a residue). If ``residues`` or ``atoms`` are
    empty, all residues or atoms are included respectively.

    Parameters
    ----------
    trj_file : str
        filepath of the trajectory
    top_file : str
        filepath of the topology of the molecule
    residues : set or str
        1-indexed residue numbers of residues to keep in the trajectory.
        Accepts smart-indexed str representing a list of residues (e.g '1-5,6,39-48').
        If Empty, include all residues.
    atoms : set 
        atomnames to keep in the trajectory. If Empty, include all atoms.
        
    Returns
    -------
    filtered_trajectory : mdtraj.Trajectory
        a trajectory object representing the filtered structure across all frames

    See Also
    --------
    filter_traj_to_pdb : Filters an input trajectory to only the specified 
                         atoms and residues and outputs to pdb
    mdtraj.Trajectory : The Trajectory object in mdtraj package
    
    Notes
    -----
    Inputed trajectory should have 1-indexed Residue Indices, 
    Outputed trajectory object will be 0-indexed.

    Examples
    --------
    >>> import stacker as st
    >>> filtered_traj = st.filter_traj('stacker/testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.mdcrd', 
    ...                             'stacker/testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop', 
    ...                             residues = {426,427}, 
    ...                             atoms = {'C2','C4','C6'})
    WARNING: Residue Indices are expected to be 1-indexed
    Reading trajectory...
    Reading topology...
    Filtering trajectory...
    WARNING: Output filtered traj atom, residue, and chain indices are zero-indexed
    >>> table, bonds = filtered_traj.topology.to_dataframe()
    >>> print(table)
    serial name element  resSeq resName  chainID segmentID
    0   None   C6       C     425       G        0          
    1   None   C2       C     425       G        0          
    2   None   C4       C     425       G        0          
    3   None   C6       C     426       C        0          
    4   None   C4       C     426       C        0          
    5   None   C2       C     426       C        0       

    Residue Number support SmartIndexing
    
    >>> import stacker as st
    >>> filtered_traj = st.filter_traj('../testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.mdcrd', 
    ...                             '../testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop', 
    ...                              residues = '1-16,25,50-57', 
    ...                              atoms = {'C2','C4','C6'})
    WARNING: Residue Indices are expected to be 1-indexed
    Reading trajectory...
    Reading topology...
    Filtering trajectory...
    WARNING: Output filtered traj atom, residue, and chain indices are zero-indexed
    >>> filtered_traj
    <mdtraj.Trajectory with 10 frames, 75 atoms, 25 residues, without unitcells at 0x1156c3ed0>

    '''
    print("WARNING: Residue Indices are expected to be 1-indexed")
    
    print("Reading trajectory...")
    trajectory = md.load(trj_file, top = top_file)
    
    print("Reading topology...")
    topology = trajectory.topology
    
    print("Filtering trajectory...")
    residues = SmartIndexingAction.parse_smart_index(residues)

    # make resSeq 0-indexed for mdtraj query
    residues = {resnum-1 for resnum in residues} 

    atomnames_query = " or ".join([f"name == '{atom}'" for atom in atoms])
    residues_query = " or ".join([f"residue == {resnum}" for resnum in residues])

    if len(atomnames_query) == 0:
        if len(residues_query) == 0:
            filtered_trajectory = trajectory
        else:
            atom_indices_selection = topology.select(residues_query)
            filtered_trajectory = trajectory.atom_slice(atom_indices_selection)
    else:
        if len(residues_query) == 0:
            atom_indices_selection = topology.select(atomnames_query)
            filtered_trajectory = trajectory.atom_slice(atom_indices_selection)
        else:
            atom_indices_selection = topology.select('(' + atomnames_query + ') and (' + residues_query + ')')
            filtered_trajectory = trajectory.atom_slice(atom_indices_selection)
    print("WARNING: Output filtered traj atom, residue, and chain indices are zero-indexed")

    return filtered_trajectory




[docs]
def filter_traj_to_pdb(trj_file : str, top_file : str, 
                       pdb : str, residues : str | set = {},
                        atoms : set = {}) -> None:
    """
    Filters an input trajectory to only the specified atoms and residues and outputs to pdb

    Filteres an input trajectory that contains all of the atoms in a trajectory to only
    the desired atoms at the desired residues (eg. the atoms necessary to find the 
    center of geometry of a residue) and writes the output to a specified pdb file.
    If residues or atomnames are empty, all residues or atoms are included respectively.

    Parameters
    ----------
    trj_file : str
        path to file of the concatenated trajectory. Should be resampled to the
        1 in 50 frames sampled trajectories for each replicate.
    top_file : str
        path to file of the topology of the molecule
    pdb : str
        path to the output pdb file
    residues : set or str
        1-indexed residue numbers of residues to keep in the trajectory.
        Accepts smart-indexed str representing a list of residues (e.g '1-5,6,39-48')
        If Empty, include all residues.
    atomnames : set 
        atomnames to keep in the trajectory

    Returns
    -------
    None

    See Also
    --------
    filter_traj : Filters an input trajectory to only the specified atoms and residues
    
    Notes
    -----
    Inputed trajectory should have 1-indexed Residue Indices, 
    Outputed trajectory object will be 0-indexed.

    """
    residues = SmartIndexingAction.parse_smart_index(residues)

    filtered_trajectory = filter_traj(trj_file, top_file, residues, atoms)
    filtered_trajectory.save_pdb(pdb)
    print("WARNING: Output file atom, residue, and chain indices are zero-indexed")
    print("Filtered trajectory written to: ", pdb)




[docs]
def file_convert(trj_file: str, top_file: str, outfile: str) -> None:
    """
    Converts a trajectory input file to a new output type.

    The output file type is determined by the ``outfile`` extension. Uses ``mdtraj.save()`` commands to convert 
    trajectory files to various file types such as ``mdtraj.save_mdcrd()``, ``mdtraj.save_pdb()``, ``mdtraj.save_xyz()``, etc.

    Parameters
    ----------
    trj_file : str
        Path to the file of the concatenated trajectory (eg. .mdcrd file). 
    top_file : str
        Path to the file of the topology of the molecule (.prmtop file).
    outfile : str
        Output filename (include .mdcrd, .pdb, etc.).

    Returns
    -------
    None

    Examples
    --------
    >>> import stacker as st
    >>> import mdtraj as md
    >>> st.file_convert('stacker/testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.mdcrd', 
    ...                 'stacker/testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop', 
    ...                 'stacker/testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.xyz')
    WARNING: Output file atom, residue, and chain indices are zero-indexed
    Trajectory written to: stacker/testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.xyz
    >>> md.load_xyz('stacker/testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.xyz', 
    ...             top='stacker/testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop')
    <mdtraj.Trajectory with 10 frames, 12089 atoms, 494 residues, without unitcells at 0x10bb75cd0>

    Notes
    -----
    Output filetype determined from file extension of `output_file` parameter.
    
    See Also
    --------
    mdtraj.load : Load trajectory files
    mdtraj.save : Save md.Trajectory to file
    mdtraj.load_xyz : Load a .xyz trajectory file

    """
    print("WARNING: Output file atom, residue, and chain indices are zero-indexed")
    trajectory = md.load(trj_file, top = top_file)
    trajectory.save(outfile)
    print("Trajectory written to: ", outfile)



[docs]
class SmartIndexingAction(argparse.Action):
    '''
    Custom argparse action to handle smart indexing of frame numbers.

    Parses a comma-separated list of frame numbers with optional ranges (e.g., '1-20, 34, 25, 50-100')
    and generates a list of individual frame numbers. Modifies the namespace by setting the attribute specified by the 'dest' parameter to the
    list of individual frame numbers.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        The argparse parser object.
    namespace: argparse.Namespace
        The argparse namespace.
    values: str
        The input string containing frame numbers and ranges.
    option_string: str, default=None
        The option string.

    Attributes
    ----------
    dest : str
        The attribute name in the namespace where the parsed list will be stored.

    Methods
    -------
    __call__(parser, namespace, values, option_string=None)
        Parses the provided string of values into a sorted list of integers and
        sets it as an attribute in the namespace.

    Examples
    --------
    >>> parser = argparse.ArgumentParser()
    >>> parser.add_argument("-fl", "--frame_list", metavar="FRAME_LIST", help="Smart-indexed list of 1-indexed Frame Numbers within trajectory to analyze", required=False, action=SmartIndexingAction)
    >>> args = parser.parse_args(["-fl", "1-20,34,25,50-100"])
    >>> print(args.frame_list)
    [1, 2, ..., 20, 34, 25, 50, 51, ..., 100]
    
    '''
    def __call__(self, parser, namespace, values, option_string=None):
        frame_list = []
        for item in values.split(','):
            if '-' in item:
                start, end = map(int, item.split('-'))
                frame_list.extend(range(start, end + 1))
            else:
                frame_list.append(int(item))
        frame_list.sort()
        setattr(namespace, self.dest, frame_list)
    

[docs]
    @staticmethod
    def parse_smart_index(value : str | set | typing.ArrayLike):
        """
        Checks that an inputted variable is a list that can be smart indexed
        and indexes it if necessary.

        Parameters
        ----------
        value : {str, set, list}
            The input string containing ranges (e.g., "1-5,10,15-20")
            or a set of integers.

        Returns
        -------
        set
            A set of integers parsed from the input.

        Examples
        --------
        >>> import stacker as st
        >>> st.SmartIndexingAction.parse_smart_index('1-5,8,12-17')
        {1, 2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 17}

        Raises
        ------
        ValueError
            If the input is not a string or set.
        """
        if isinstance(value, str):
            parsed_set = set()
            if value == '':
                return {}
            
            for item in value.split(','):
                if '-' in item:
                    start, end = map(int, item.split('-'))
                    parsed_set.update(range(start, end + 1))
                else:
                    parsed_set.add(int(item))
            return parsed_set
        elif isinstance(value, set) or isinstance(value, set) or isinstance(value, dict):
            return value
        elif isinstance(value, list):
            return value
        else:
            raise ValueError("Input must be a string, list, or set of integers.")




if __name__ == "__main__":
    # filter_traj tests
    print('Known Res: 426 = G and 427 = C')
    filtered_traj = filter_traj('testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.mdcrd', 'testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop', {426,427}, {'C2','C4','C6'})
    table, bonds = filtered_traj.topology.to_dataframe()
    print(table)

    ### No Filtering
    print("No Filtering, known trj has 12089 atoms")
    filtered_traj = filter_traj('testing/first10_5JUP_N2_tUAG_aCUA_+1GCU_nowat.mdcrd', 'testing/5JUP_N2_tUAG_aCUA_+1GCU_nowat.prmtop', residues={}, atoms={})
    table, bonds = filtered_traj.topology.to_dataframe()
    print(table)