From 561344d0f3d63eb902a825d63af694ed5dec1f08 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 8 Jul 2024 18:43:16 +0100 Subject: [PATCH] linting --- biopandas/mmcif/mmcif_parser.py | 21 +++- biopandas/mmcif/pandas_mmcif.py | 144 +++++++++++++++-------- biopandas/mmcif/tests/test_amino3to1.py | 8 +- biopandas/mmcif/tests/test_distance.py | 10 +- biopandas/mmcif/tests/test_read_mmcif.py | 21 ++-- biopandas/mmcif/tests/test_rmsd.py | 6 +- 6 files changed, 145 insertions(+), 65 deletions(-) diff --git a/biopandas/mmcif/mmcif_parser.py b/biopandas/mmcif/mmcif_parser.py index 96d0a31..91556bb 100644 --- a/biopandas/mmcif/mmcif_parser.py +++ b/biopandas/mmcif/mmcif_parser.py @@ -22,19 +22,28 @@ def __init__(self, parser_obj): self.names_defined = False def add_name(self, name): - cat_name = type(name) == str and partition_string(name, ".") or ["", "", ""] + cat_name = ( + type(name) == str and partition_string(name, ".") or ["", "", ""] + ) if cat_name[1]: if cat_name[0] not in self.parser_obj.current_target[-2]: self.parser_obj.current_target[-2][cat_name[0]] = {} - if cat_name[2] not in self.parser_obj.current_target[-2][cat_name[0]]: - self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]] = [] + if ( + cat_name[2] + not in self.parser_obj.current_target[-2][cat_name[0]] + ): + self.parser_obj.current_target[-2][cat_name[0]][ + cat_name[2] + ] = [] self.ref_list.append( self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]] ) else: if cat_name[0] not in self.parser_obj.current_target[-2]: self.parser_obj.current_target[-2][cat_name[0]] = [] - self.ref_list.append(self.parser_obj.current_target[-2][cat_name[0]]) + self.ref_list.append( + self.parser_obj.current_target[-2][cat_name[0]] + ) self.length = len(self.ref_list) def push_value(self, value): @@ -289,7 +298,9 @@ def __dump_str__(inp): return str(inp) if re.search(__CIF_STR_NL_CHECK__, inp) is not None: return "\n;%s\n;" % inp - return "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp + return ( + "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp + ) def __pad_string__(inp, flength): diff --git a/biopandas/mmcif/pandas_mmcif.py b/biopandas/mmcif/pandas_mmcif.py index 167b79e..e00c1f5 100644 --- a/biopandas/mmcif/pandas_mmcif.py +++ b/biopandas/mmcif/pandas_mmcif.py @@ -1,4 +1,5 @@ """Class for working with MMCIF files.""" + # BioPandas # Authors: Arian Jamasb , # Authors: Sebastian Raschka @@ -69,56 +70,76 @@ def read_mmcif(self, path): self.code = self.data["entry"]["id"][0].lower() return self - def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"): + def fetch_mmcif( + self, + pdb_code: Optional[str] = None, + uniprot_id: Optional[str] = None, + source: str = "pdb", + ): """Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/. -. + . - Parameters - ---------- - pdb_code : str, optional - A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`. + Parameters + ---------- + pdb_code : str, optional + A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`. - uniprot_id : str, optional - A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`. + uniprot_id : str, optional + A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`. - source : str - The source to retrieve the structure from - (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`. + source : str + The source to retrieve the structure from + (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`. - Returns - --------- - self + Returns + --------- + self """ # Sanitize input invalid_input_identifier_1 = pdb_code is None and uniprot_id is None - invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None - invalid_input_combination_1 = uniprot_id is not None and source == "pdb" + invalid_input_identifier_2 = ( + pdb_code is not None and uniprot_id is not None + ) + invalid_input_combination_1 = ( + uniprot_id is not None and source == "pdb" + ) invalid_input_combination_2 = pdb_code is not None and source in { - "alphafold2-v3", "alphafold2-v4"} + "alphafold2-v3", + "alphafold2-v4", + } if invalid_input_identifier_1 or invalid_input_identifier_2: raise ValueError( - "Please provide either a PDB code or a UniProt ID.") + "Please provide either a PDB code or a UniProt ID." + ) if invalid_input_combination_1: raise ValueError( - "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.") + "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'." + ) elif invalid_input_combination_2: raise ValueError( - f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.") + f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}." + ) if source == "pdb": self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code) elif source == "alphafold2-v3": af2_version = 3 - self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version) + self.mmcif_path, self.mmcif_text = self._fetch_af2( + uniprot_id, af2_version + ) elif source == "alphafold2-v4": af2_version = 4 - self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version) + self.mmcif_path, self.mmcif_text = self._fetch_af2( + uniprot_id, af2_version + ) else: - raise ValueError(f"Invalid source: {source}." - " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'.") + raise ValueError( + f"Invalid source: {source}." + " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'." + ) self._df = self._construct_df(text=self.mmcif_text) return self @@ -129,7 +150,8 @@ def _construct_df(self, text: str): self.data = data df: Dict[str, pd.DataFrame] = {} full_df = pd.DataFrame.from_dict( - data["atom_site"], orient="index").transpose() + data["atom_site"], orient="index" + ).transpose() full_df = full_df.astype(mmcif_col_types, errors="ignore") df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"]) df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"]) @@ -148,8 +170,9 @@ def _fetch_mmcif(pdb_code): response = urlopen(url) txt = response.read() txt = ( - txt.decode( - "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii") + txt.decode("utf-8") + if sys.version_info[0] >= 3 + else txt.encode("ascii") ) except HTTPError as e: print(f"HTTP Error {e.code}") @@ -166,11 +189,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3): try: response = urlopen(url) txt = response.read() - txt = txt.decode('utf-8') if sys.version_info[0] >= 3 else txt.encode('ascii') + txt = ( + txt.decode("utf-8") + if sys.version_info[0] >= 3 + else txt.encode("ascii") + ) except HTTPError as e: - print(f'HTTP Error {e.code}') + print(f"HTTP Error {e.code}") except URLError as e: - print(f'URL Error {e.args}') + print(f"URL Error {e.args}") return url, txt @staticmethod @@ -184,7 +211,8 @@ def _read_mmcif(path): openf = gzip.open else: allowed_formats = ", ".join( - (".cif", ".cif.gz", ".mmcif", ".mmcif.gz")) + (".cif", ".cif.gz", ".mmcif", ".mmcif.gz") + ) raise ValueError( f"Wrong file format; allowed file formats are {allowed_formats}" ) @@ -194,8 +222,9 @@ def _read_mmcif(path): if path.endswith(".gz"): txt = ( - txt.decode( - "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii") + txt.decode("utf-8") + if sys.version_info[0] >= 3 + else txt.encode("ascii") ) return path, txt @@ -271,14 +300,19 @@ def _get_mainchain( def _get_hydrogen(df, invert): """Return only hydrogen atom entries from a DataFrame""" return ( - df[(df["type_symbol"] != "H")] if invert else df[( - df["type_symbol"] == "H")] + df[(df["type_symbol"] != "H")] + if invert + else df[(df["type_symbol"] == "H")] ) @staticmethod def _get_heavy(df, invert): """Return only heavy atom entries from a DataFrame""" - return df[df["type_symbol"] == "H"] if invert else df[df["type_symbol"] != "H"] + return ( + df[df["type_symbol"] == "H"] + if invert + else df[df["type_symbol"] != "H"] + ) @staticmethod def _get_calpha(df, invert, atom_col: str = "auth_atom_id"): @@ -288,7 +322,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"): @staticmethod def _get_carbon(df, invert): """Return carbon atom entries from a DataFrame""" - return df[df["type_symbol"] != "C"] if invert else df[df["type_symbol"] == "C"] + return ( + df[df["type_symbol"] != "C"] + if invert + else df[df["type_symbol"] == "C"] + ) def amino3to1( self, @@ -339,8 +377,9 @@ def amino3to1( indices.append(ind) cmp = num - transl = tmp.iloc[indices][residue_col].map( - amino3to1dict).fillna(fillna) + transl = ( + tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna) + ) return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1) @@ -425,7 +464,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")): return np.sqrt( np.sum( - df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1 + df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) + ** 2, + axis=1, ) ) @@ -451,7 +492,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)): """ return np.sqrt( np.sum( - df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1 + df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) + ** 2, + axis=1, ) ) @@ -485,7 +528,11 @@ def read_mmcif_from_list(self, mmcif_lines): self.code = self.data["entry"]["id"][0].lower() return self - def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb: + def convert_to_pandas_pdb( + self, + offset_chains: bool = True, + records: List[str] = ["ATOM", "HETATM"], + ) -> PandasPdb: """Returns a PandasPdb object with the same data as the PandasMmcif object. @@ -525,10 +572,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = # Update atom numbers if offset_chains: - offsets = pandaspdb.df["ATOM"]["chain_id"].astype( - "category").cat.codes - pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets + offsets = ( + pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes + ) + pandaspdb.df["ATOM"]["atom_number"] = ( + pandaspdb.df["ATOM"]["atom_number"] + offsets + ) hetatom_offset = offsets.max() + 1 - pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset + pandaspdb.df["HETATM"]["atom_number"] = ( + pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset + ) return pandaspdb diff --git a/biopandas/mmcif/tests/test_amino3to1.py b/biopandas/mmcif/tests/test_amino3to1.py index a03c364..83a671c 100644 --- a/biopandas/mmcif/tests/test_amino3to1.py +++ b/biopandas/mmcif/tests/test_amino3to1.py @@ -805,8 +805,12 @@ def test_multichain(): expect_chain = ["A" for _ in range(88)] + ["B" for _ in range(94)] got_chain = list(transl["auth_asym_id"].values) - got_res_a = list(transl.loc[transl["auth_asym_id"] == "A", "auth_comp_id"].values) - got_res_b = list(transl.loc[transl["auth_asym_id"] == "B", "auth_comp_id"].values) + got_res_a = list( + transl.loc[transl["auth_asym_id"] == "A", "auth_comp_id"].values + ) + got_res_b = list( + transl.loc[transl["auth_asym_id"] == "B", "auth_comp_id"].values + ) assert expect_chain == got_chain assert expect_res_a == got_res_a diff --git a/biopandas/mmcif/tests/test_distance.py b/biopandas/mmcif/tests/test_distance.py index f827d01..e7cd116 100644 --- a/biopandas/mmcif/tests/test_distance.py +++ b/biopandas/mmcif/tests/test_distance.py @@ -18,7 +18,8 @@ def test_equal(): dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records=("ATOM",)) expect = pd.Series( - [2.533259, 1.520502, 0.000000, 1.257597, 1.252510], index=[12, 13, 14, 15, 16] + [2.533259, 1.520502, 0.000000, 1.257597, 1.252510], + index=[12, 13, 14, 15, 16], ) assert dist[dist < 3].all() == expect.all() @@ -31,7 +32,8 @@ def test_deprecated_str_arg(): dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records="ATOM") expect = pd.Series( - [2.533259, 1.520502, 0.000000, 1.257597, 1.252510], index=[12, 13, 14, 15, 16] + [2.533259, 1.520502, 0.000000, 1.257597, 1.252510], + index=[12, 13, 14, 15, 16], ) assert dist[dist < 3].all() == expect.all() @@ -44,5 +46,7 @@ def test_use_external_df(): new_df = p1t48.df["ATOM"].iloc[:-1, :].copy() dist = PandasMmcif.distance_df(df=new_df, xyz=(70.785, 15.477, 23.359)) - expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597], index=[12, 13, 14, 15]) + expect = pd.Series( + [2.533259, 1.520502, 0.000000, 1.257597], index=[12, 13, 14, 15] + ) assert dist[dist < 3].all() == expect.all() diff --git a/biopandas/mmcif/tests/test_read_mmcif.py b/biopandas/mmcif/tests/test_read_mmcif.py index 7189702..983e848 100644 --- a/biopandas/mmcif/tests/test_read_mmcif.py +++ b/biopandas/mmcif/tests/test_read_mmcif.py @@ -6,11 +6,11 @@ import os -import pytest -from urllib.error import HTTPError from pathlib import Path +from urllib.error import HTTPError import pandas as pd +import pytest from biopandas.mmcif import PandasMmcif from biopandas.pdb import PandasPdb from biopandas.testutils import assert_raises @@ -22,8 +22,12 @@ # TESTDATA_FILENAME2 = os.path.join( # os.path.dirname(__file__), "data", "4eiy_anisouchunk.cif" # ) -TESTDATA_FILENAME2 = os.path.join(os.path.dirname(__file__), "data", "4eiy.cif") -TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "data", "3eiy.cif.gz") +TESTDATA_FILENAME2 = os.path.join( + os.path.dirname(__file__), "data", "4eiy.cif" +) +TESTDATA_FILENAME_GZ = os.path.join( + os.path.dirname(__file__), "data", "3eiy.cif.gz" +) TESTDATA_FILENAME_AF2_V4 = os.path.join( os.path.dirname(__file__), "data", "AF-Q5VSL9-F1-model_v4.cif" @@ -90,7 +94,6 @@ af2_test_struct_v3 = f.read() - def test__read_pdb(): """Test private _read_pdb""" ppdb = PandasMmcif() @@ -334,7 +337,9 @@ def test_mmcif_pdb_conversion(): ) assert_frame_equal( pdb.df["HETATM"].drop(columns=["line_idx"]), - mmcif_pdb.df["HETATM"].drop(columns=["line_idx"]).reset_index(drop=True), + mmcif_pdb.df["HETATM"] + .drop(columns=["line_idx"]) + .reset_index(drop=True), ) # single chain test @@ -348,5 +353,7 @@ def test_mmcif_pdb_conversion(): ) assert_frame_equal( pdb.df["HETATM"].drop(columns=["line_idx"]), - mmcif_pdb.df["HETATM"].drop(columns=["line_idx"]).reset_index(drop=True), + mmcif_pdb.df["HETATM"] + .drop(columns=["line_idx"]) + .reset_index(drop=True), ) diff --git a/biopandas/mmcif/tests/test_rmsd.py b/biopandas/mmcif/tests/test_rmsd.py index 5507059..054f3b2 100644 --- a/biopandas/mmcif/tests/test_rmsd.py +++ b/biopandas/mmcif/tests/test_rmsd.py @@ -5,8 +5,8 @@ # Code Repository: https://github.com/rasbt/biopandas import os -import pytest +import pytest from biopandas.mmcif import PandasMmcif TESTDATA_1t48 = os.path.join(os.path.dirname(__file__), "data", "1t48.cif") @@ -48,7 +48,9 @@ def test_invalid_query(): def test_protein(): - r = PandasMmcif.rmsd(p1t48.df["ATOM"], p1t49.df["ATOM"], s="c-alpha", invert=False) + r = PandasMmcif.rmsd( + p1t48.df["ATOM"], p1t49.df["ATOM"], s="c-alpha", invert=False + ) assert r == 0.4923, r