From cd249c537c1f87d903f688e633eadc0705d223d7 Mon Sep 17 00:00:00 2001 From: jjmccollum Date: Thu, 16 Jan 2025 02:55:26 +1100 Subject: [PATCH 1/2] :zap: Added code, tests, and documentation for similarity tables and the --show-ext option for distance and similarity tables --- docs/advanced.rst | 12 ++- pyproject.toml | 2 +- teiphy/collation.py | 171 ++++++++++++++++++++++++++++++++++++---- teiphy/main.py | 7 +- tests/test_collation.py | 62 +++++++++++++++ tests/test_main.py | 80 ++++++++++++++++++- 6 files changed, 313 insertions(+), 21 deletions(-) diff --git a/docs/advanced.rst b/docs/advanced.rst index 1727edf..b604028 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -626,7 +626,11 @@ Collations can also be converted to tabular formats. Within Python, the ``collation`` class's ``to_numpy`` method can be invoked to convert a collation to a NumPy ``array`` with rows for variant readings, columns for witnesses, and frequency values in the cells. Where a witness has missing data at a variation, its frequencies for different readings at this unit can be split evenly over 1 using the ``split_missing`` argument; otherwise, the witness will have frequencies of 0 for all readings at that unit. The same class's ``to_distance_matrix`` method produces a NumPy ``array`` with rows and columns for witnesses, where each cell contains the number of units where the row witness and column witness both have unambiguous readings and these readings disagree. -The cells can instead be populated with the proportion of disagreements to units where the row and column witnesses have readings with the ``proportion`` argument. +The cells can instead be populated with the proportion of disagreements among units where the row and column witnesses have readings with the ``proportion`` argument. 
+If you specify the ``show_ext`` argument as True, then each cell will be populated by the number or proportion of disagreements followed by the number of units where both witnesses have unambiguous readings (e.g., 3/50 or 0.06/50). +The same class's ``to_similarity_matrix`` method produces a NumPy ``array`` with rows and columns for witnesses, where each cell contains the number of units where the row witness and column witness both have unambiguous readings and these readings agree. +The cells can instead be populated with the proportion of agreements among units where the row and column witnesses have readings with the ``proportion`` argument. +If you specify the ``show_ext`` argument as True, then each cell will be populated by the number or proportion of agreements followed by the number of units where both witnesses have unambiguous readings (e.g., 47/50 or 0.94/50). The same class's ``to_nexus_table`` method produces a NumPy ``array`` with rows for witnesses, columns for variation unit IDs, and attested reading IDs in the cells, resembling a NEXUS sequence. By default, cells corresponding to ambiguous readings are written as space-separated sequences of readings between braces, but they can be written as missing states with the ``ambiguous_as_missing`` argument. The same class's ``to_long_table`` method produces a NumPy ``array`` with columns for witness ID, variation unit ID, reading index, and reading text and rows for all combinations of these values found in the collation. @@ -634,9 +638,11 @@ The ``to_dataframe`` method invokes ``to_numpy`` by default, but if the ``table_ It returns a Pandas ``DataFrame`` augmented with row and column labels (or, in the case of a long table, just column labels). From the command line, the standard reading-witness matrix or long table can be written to a specified CSV, TSV, or Excel (.xlsx) file. -If you specify the output filename with its extension, ``teiphy`` will infer which format to use. 
+If you specify the output filename with its extension, ``teiphy`` will infer which format to use. +If you want to write a distance matrix, a similarity matrix, a NEXUS-style table, or a long table to output instead of a reading-witness matrix, then you can do so by specifying the ``--table distance``, ``--table similarity``, ``--table nexus``, or ``--table long`` command-line argument, respectively. If you are writing a reading-witness matrix to output, you can set the method's ``split_missing`` argument using the ``--split-missing`` command-line flag. -If you want to write a distance matrix, a NEXUS-style table, or a long table to output instead of a reading-witness matrix, then you can do so by specifying the ``--table distance``, ``--table nexus``, or ``--table long`` command-line argument, respectively. +If you are writing a distance or similarity matrix to output, then you can set the method's ``proportion`` and ``show_ext`` arguments using the ``--proportion`` and ``--show-ext`` command-line flags, respectively. +As with plain NEXUS outputs, if you are writing a NEXUS table to output, then you can set the method's ``ambiguous_as_missing`` argument using the ``--ambiguous-as-missing`` command-line flag. 
Other Options ------------- diff --git a/pyproject.toml b/pyproject.toml index 9bc2124..090c1c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "teiphy" -version = "0.1.18" +version = "0.1.19" description = "Converts TEI XML collations to NEXUS and other formats" authors = ["Joey McCollum and Robert Turnbull"] license = "MIT" diff --git a/teiphy/collation.py b/teiphy/collation.py index 25766f5..6ea6149 100644 --- a/teiphy/collation.py +++ b/teiphy/collation.py @@ -38,13 +38,14 @@ class ClockModel(str, Enum): class AncestralLogger(str, Enum): state = "state" - sequence = ("sequence",) + sequence = "sequence" none = "none" class TableType(str, Enum): matrix = "matrix" distance = "distance" + similarity = "similarity" nexus = "nexus" long = "long" @@ -1738,13 +1739,19 @@ def to_numpy(self, drop_constant: bool = False, split_missing: bool = True): col_ind += 1 return matrix, reading_labels, witness_labels - def to_distance_matrix(self, drop_constant: bool = False, proportion=False): + def to_distance_matrix(self, drop_constant: bool = False, proportion: bool = False, show_ext: bool = False): """Transforms this Collation into a NumPy distance matrix between witnesses, along with an array of its labels for the witnesses. Distances can be computed either as counts of disagreements (the default setting), or as proportions of disagreements over all variation units where both witnesses have singleton readings. + Optionally, the count of units where both witnesses have singleton readings can be included after the count/proportion of disagreements. Args: drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units. + Default value is False. 
+ show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of their extant, unambiguous variation units after the number of their disagreements. + Default value is False. Returns: A NumPy distance matrix with a row and column for each witness. @@ -1762,19 +1769,25 @@ def to_distance_matrix(self, drop_constant: bool = False, proportion=False): substantive_variation_unit_reading_tuples_set = set(self.substantive_variation_unit_reading_tuples) # Initialize the output array with the appropriate dimensions: witness_labels = [wit.id for wit in self.witnesses] - matrix = np.zeros((len(witness_labels), len(witness_labels)), dtype=float) + # The type of the matrix will depend on the input options: + matrix = None + if show_ext: + matrix = np.full((len(witness_labels), len(witness_labels)), "NA", dtype=object) # strings of the form "disagreements/extant" + elif proportion: + matrix = np.full((len(witness_labels), len(witness_labels)), 0.0, dtype=float) # floats of the form disagreements/extant + else: + matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form disagreements for i, wit_1 in enumerate(witness_labels): for j, wit_2 in enumerate(witness_labels): extant_units = 0 disagreements = 0 - # If both witnesses are the same, then the matrix entry is trivially 0: - if j == i: - matrix[i, j] = 0 - continue - # If either of the cells for this pair of witnesses has been populated already, then just copy the distance without recalculating: + # If either of the cells for this pair of witnesses has been populated already, + # then just copy the entry from the other side of the diagonal without recalculating: if i > j: matrix[i, j] = matrix[j, i] continue + # Otherwise, calculate the number of units where both witnesses have unambiguous readings + # and the number of units where they disagree: for k, vu_id in enumerate(self.variation_unit_ids): if vu_id not in 
substantive_variation_unit_ids_set: continue @@ -1787,12 +1800,91 @@ def to_distance_matrix(self, drop_constant: bool = False, proportion=False): extant_units += 1 if wit_1_rdg_inds[0] != wit_2_rdg_inds[0]: disagreements += 1 + cell_entry = None + if proportion: + cell_entry = disagreements / max( + extant_units, 1 + ) # the max in the denominator is to prevent division by 0; the distance entry will be 0 if the two witnesses have no overlap + else: + cell_entry = disagreements + if show_ext: + cell_entry = str(cell_entry) + "/" + str(extant_units) + matrix[i, j] = cell_entry + return matrix, witness_labels + + def to_similarity_matrix(self, drop_constant: bool = False, proportion: bool = False, show_ext: bool = False): + """Transforms this Collation into a NumPy similarity matrix between witnesses, along with an array of its labels for the witnesses. + Similarities can be computed either as counts of agreements (the default setting), or as proportions of agreements over all variation units where both witnesses have singleton readings. + Optionally, the count of units where both witnesses have singleton readings can be included after the count/proportion of agreements. + + Args: + drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. + proportion (bool, optional): An optional flag indicating whether or not to calculate similarities as proportions over extant, unambiguous variation units. + Default value is False. + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of their extant, unambiguous variation units after the number of agreements. + Default value is False. + + Returns: + A NumPy distance matrix with a row and column for each witness. + A list of witness ID strings. 
+ """ + # Populate a list of sites that will correspond to columns of the sequence alignment: + substantive_variation_unit_ids = self.variation_unit_ids + if drop_constant: + substantive_variation_unit_ids = [ + vu_id + for vu_id in self.variation_unit_ids + if len(self.substantive_readings_by_variation_unit_id[vu_id]) > 1 + ] + substantive_variation_unit_ids_set = set(substantive_variation_unit_ids) + substantive_variation_unit_reading_tuples_set = set(self.substantive_variation_unit_reading_tuples) + # Initialize the output array with the appropriate dimensions: + witness_labels = [wit.id for wit in self.witnesses] + # The type of the matrix will depend on the input options: + matrix = None + if show_ext: + matrix = np.full((len(witness_labels), len(witness_labels)), "NA", dtype=object) # strings of the form "agreements/extant" + elif proportion: + matrix = np.full((len(witness_labels), len(witness_labels)), 0.0, dtype=float) # floats of the form agreements/extant + else: + matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form agreements + for i, wit_1 in enumerate(witness_labels): + for j, wit_2 in enumerate(witness_labels): + extant_units = 0 + agreements = 0 + # If either of the cells for this pair of witnesses has been populated already, + # then just copy the entry from the other side of the diagonal without recalculating: + if i > j: + matrix[i, j] = matrix[j, i] + continue + # Otherwise, calculate the number of units where both witnesses have unambiguous readings + # and the number of units where they agree: + for k, vu_id in enumerate(self.variation_unit_ids): + if vu_id not in substantive_variation_unit_ids_set: + continue + wit_1_rdg_support = self.readings_by_witness[wit_1][k] + wit_2_rdg_support = self.readings_by_witness[wit_2][k] + wit_1_rdg_inds = [l for l, w in enumerate(wit_1_rdg_support) if w > 0] + wit_2_rdg_inds = [l for l, w in enumerate(wit_2_rdg_support) if w > 0] + if len(wit_1_rdg_inds) != 1 or 
len(wit_2_rdg_inds) != 1: + continue + if i == 0 and j == 1: + print(vu_id, wit_1_rdg_inds[0], wit_2_rdg_inds[0]) + extant_units += 1 + if wit_1_rdg_inds[0] == wit_2_rdg_inds[0]: + agreements += 1 + cell_entry = None if proportion: - matrix[i, j] = disagreements / max( + cell_entry = agreements / max( extant_units, 1 ) # the max in the denominator is to prevent division by 0; the distance entry will be 0 if the two witnesses have no overlap else: - matrix[i, j] = disagreements + cell_entry = agreements + if show_ext: + cell_entry = str(cell_entry) + "/" + str(extant_units) + matrix[i, j] = cell_entry return matrix, witness_labels def to_nexus_table(self, drop_constant: bool = False, ambiguous_as_missing: bool = False): @@ -1800,7 +1892,9 @@ def to_nexus_table(self, drop_constant: bool = False, ambiguous_as_missing: bool Args: drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. ambiguous_as_missing (bool, optional): An optional flag indicating whether to treat all ambiguous states as missing data. + Default value is False. Returns: A NumPy array with rows for taxa, columns for characters, and reading IDs in cells. @@ -1875,6 +1969,7 @@ def to_long_table(self, drop_constant: bool = False): Args: drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. Returns: A NumPy array with columns for taxa, characters, reading indices, and reading values, and rows for each combination of these values in the matrix. @@ -1938,17 +2033,27 @@ def to_dataframe( proportion: bool = False, table_type: TableType = TableType.matrix, split_missing: bool = True, + show_ext: bool = False, ): """Returns this Collation in the form of a Pandas DataFrame array, including the appropriate row and column labels. 
Args: drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. ambiguous_as_missing (bool, optional): An optional flag indicating whether to treat all ambiguous states as missing data. + Default value is False. proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units. + Default value is False. table_type (TableType, optional): A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". - split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True. + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). + Default value is True. + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. + Only applicable for tabular output formats of type \"distance\" or \"similarity\". + Default value is False. Returns: A Pandas DataFrame corresponding to a collation matrix with reading frequencies or a long table with discrete reading states. 
@@ -1963,7 +2068,11 @@ def to_dataframe( df = pd.DataFrame(matrix, index=reading_labels, columns=witness_labels) elif table_type == TableType.distance: # Convert the collation to a NumPy array and get its row and column labels first: - matrix, witness_labels = self.to_distance_matrix(drop_constant=drop_constant, proportion=proportion) + matrix, witness_labels = self.to_distance_matrix(drop_constant=drop_constant, proportion=proportion, show_ext=show_ext) + df = pd.DataFrame(matrix, index=witness_labels, columns=witness_labels) + elif table_type == TableType.similarity: + # Convert the collation to a NumPy array and get its row and column labels first: + matrix, witness_labels = self.to_similarity_matrix(drop_constant=drop_constant, proportion=proportion, show_ext=show_ext) df = pd.DataFrame(matrix, index=witness_labels, columns=witness_labels) elif table_type == TableType.nexus: # Convert the collation to a NumPy array and get its row and column labels first: @@ -1985,6 +2094,7 @@ def to_csv( proportion: bool = False, table_type: TableType = TableType.matrix, split_missing: bool = True, + show_ext: bool = False, **kwargs ): """Writes this Collation to a comma-separated value (CSV) file with the given address. @@ -1994,12 +2104,21 @@ def to_csv( Args: file_addr: A string representing the path to an output CSV file; the file type should be .csv. drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data. + Default value is False. proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units. + Default value is False. table_type: A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". 
- split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True. + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). + Default value is True. + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. + Only applicable for tabular output formats of type \"distance\" or \"similarity\". + Default value is False. **kwargs: Keyword arguments for pandas.DataFrame.to_csv. """ # Convert the collation to a Pandas DataFrame first: @@ -2009,6 +2128,7 @@ def to_csv( proportion=proportion, table_type=table_type, split_missing=split_missing, + show_ext=show_ext, ) # Generate all parent folders for this file that don't already exist: Path(file_addr).parent.mkdir(parents=True, exist_ok=True) @@ -2029,6 +2149,7 @@ def to_excel( proportion: bool = False, table_type: TableType = TableType.matrix, split_missing: bool = True, + show_ext: bool = False, ): """Writes this Collation to an Excel (.xlsx) file with the given address. @@ -2037,12 +2158,21 @@ def to_excel( Args: file_addr: A string representing the path to an output Excel file; the file type should be .xlsx. drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data. + Default value is False. proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units. 
+ Default value is False. table_type: A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". - split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True. + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). + Default value is True. + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. + Only applicable for tabular output formats of type \"distance\" or \"similarity\". + Default value is False. """ # Convert the collation to a Pandas DataFrame first: df = self.to_dataframe( @@ -2051,6 +2181,7 @@ def to_excel( proportion=proportion, table_type=table_type, split_missing=split_missing, + show_ext=show_ext, ) # Generate all parent folders for this file that don't already exist: Path(file_addr).parent.mkdir(parents=True, exist_ok=True) @@ -2244,6 +2375,7 @@ def to_file( clock_model: ClockModel = ClockModel.strict, ancestral_logger: AncestralLogger = AncestralLogger.state, table_type: TableType = TableType.matrix, + show_ext: bool = False, seed: int = None, ): """Writes this Collation to the file with the given address. @@ -2254,6 +2386,7 @@ def to_file( If None then it is infered from the file suffix. Defaults to None. drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. + Default value is False. 
split_missing (bool, optional): An optional flag indicating whether to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). @@ -2279,17 +2412,24 @@ def to_file( calibrate_dates (bool, optional): An optional flag indicating whether to add an Assumptions block that specifies date distributions for witnesses in NEXUS output. This option is intended for inputs to BEAST 2. + Default value is False. mrbayes (bool, optional): An optional flag indicating whether to add a MrBayes block that specifies model settings and age calibrations for witnesses in NEXUS output. This option is intended for inputs to MrBayes. + Default value is False. clock_model (ClockModel, optional): A ClockModel option indicating which type of clock model to use. This option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified. + Default value is "strict". ancestral_logger (AncestralLogger, optional): An AncestralLogger option indicating which class of logger (if any) to use for ancestral states. This option is intended for inputs to BEAST 2. table_type (TableType, optional): A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". + show_ext (bool, optional): An optional flag indicating whether each cell in a distance or similarity matrix + should include the number of variation units where both witnesses are extant after the number of their disagreements/agreements. + Only applicable for tabular output formats of type \"distance\" or \"similarity\". + Default value is False. seed (optional, int): A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output). 
""" file_addr = Path(file_addr) @@ -2335,6 +2475,7 @@ def to_file( proportion=proportion, table_type=table_type, split_missing=split_missing, + show_ext=show_ext, ) if format == Format.TSV: @@ -2345,6 +2486,7 @@ def to_file( proportion=proportion, table_type=table_type, split_missing=split_missing, + show_ext=show_ext, sep="\t", ) @@ -2356,6 +2498,7 @@ def to_file( proportion=proportion, table_type=table_type, split_missing=split_missing, + show_ext=show_ext, ) if format == Format.STEMMA: diff --git a/teiphy/main.py b/teiphy/main.py index 3f8b9cb..5d7c23e 100644 --- a/teiphy/main.py +++ b/teiphy/main.py @@ -81,12 +81,16 @@ def to_file( ), table: TableType = typer.Option( TableType.matrix, - help="The type of table to use for CSV/Excel output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.", + help="The type of table to use for CSV/Excel output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). 
If \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.", ), split_missing: bool = typer.Option( False, help="Treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Not applicable for non-tabular formats.", ), + show_ext: bool = typer.Option( + False, + help="If set, each cell in a distance or similarity matrix will display the count/proportion of disagreements/agreements, followed by the number of variation units where both witnesses are extant and have unambiguous readings. (For example, a cell containing 47/50 in a similarity table would indicate that the row and column witnesses agree at 47 of the 50 units where they both have readings.) 
This option is only valid for tabular output formats of type \"distance\" or \"similarity\".", + ), seed: int = typer.Option( None, help="Seed for random number generation (used for setting default initial values of transcriptional rate parameters for BEAST 2 XML output); if not specified, then the default seeding of the numpy.random.default_rng class will be used.", @@ -177,5 +181,6 @@ def to_file( ancestral_logger=ancestral_logger, table_type=table, split_missing=split_missing, + show_ext=show_ext, seed=seed, ) diff --git a/tests/test_collation.py b/tests/test_collation.py index 74dc19e..253cada 100644 --- a/tests/test_collation.py +++ b/tests/test_collation.py @@ -446,6 +446,68 @@ def test_to_distance_matrix_drop_constant_proportion(self): abs(matrix[0, 1] - 13 / (len(self.xml_variation_units) - 2 - 2)) < 1e-4 ) # entry for UBS and P46 should be 13 divided by the number of non-constant variation units where neither witness is lacunose or ambiguous + def test_to_distance_matrix_show_ext(self): + matrix, witness_labels = self.collation.to_distance_matrix(show_ext=True) + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], "13/40" + ) + + def test_to_distance_matrix_proportion_show_ext(self): + matrix, witness_labels = self.collation.to_distance_matrix(proportion=True, show_ext=True) + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], "0.325/40" + ) + + def test_to_similarity_matrix(self): + matrix, witness_labels = self.collation.to_similarity_matrix() + self.assertNotEqual(np.trace(matrix), 0) # diagonal entries should be nonzero + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], 27 + ) # entry for UBS and P46 should be 27 (remember not to count P46 lacunae and ambiguities and to count P46 defective readings as agreeing with the UBS reading) + + def 
test_to_similarity_matrix_drop_constant(self): + matrix, witness_labels = self.collation.to_similarity_matrix(drop_constant=True) + self.assertNotEqual(np.trace(matrix), 0) # diagonal entries should be 0 + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], 25 + ) # entry for UBS and P46 should be 25 (the two constant variation units should be dropped) + + def test_to_similarity_matrix_proportion(self): + matrix, witness_labels = self.collation.to_similarity_matrix(proportion=True) + self.assertEqual(np.trace(matrix), 38) # diagonal entries should be 1 + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertTrue(np.all(matrix >= 0.0) and np.all(matrix <= 1.0)) # all elements should be between 0 and 1 + self.assertTrue( + abs(matrix[0, 1] - 27 / (len(self.xml_variation_units) - 2)) < 1e-4 + ) # entry for UBS and P46 should be 27 divided by the number of variation units where neither witness is lacunose or ambiguous + + def test_to_similarity_matrix_drop_constant_proportion(self): + matrix, witness_labels = self.collation.to_similarity_matrix(drop_constant=True, proportion=True) + self.assertEqual(np.trace(matrix), 38) # diagonal entries should be 0 + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertTrue(np.all(matrix >= 0.0) and np.all(matrix <= 1.0)) # all elements should be between 0 and 1 + self.assertTrue( + abs(matrix[0, 1] - 25 / (len(self.xml_variation_units) - 2 - 2)) < 1e-4 + ) # entry for UBS and P46 should be 25 divided by the number of non-constant variation units where neither witness is lacunose or ambiguous + + def test_to_similarity_matrix_show_ext(self): + matrix, witness_labels = self.collation.to_similarity_matrix(show_ext=True) + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], "27/40" + ) + + def test_to_similarity_matrix_proportion_show_ext(self): + 
matrix, witness_labels = self.collation.to_similarity_matrix(proportion=True, show_ext=True) + self.assertTrue(np.all(matrix == matrix.T)) # matrix should be symmetrical + self.assertEqual( + matrix[0, 1], "0.675/40" + ) + def test_to_nexus_table(self): nexus_table, row_labels, column_labels = self.collation.to_nexus_table() self.assertEqual(row_labels[0], "UBS") diff --git a/tests/test_main.py b/tests/test_main.py index 901e6c4..8099e83 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1388,7 +1388,7 @@ def test_to_csv_distance_table(): text = output.read_text(encoding="utf-8-sig") assert text.startswith(",UBS,P46,01,02,03,04,06") assert "\nUBS," in text - assert ",13.0," in text + assert ",13," in text def test_to_csv_proportion_distance_table(): @@ -1405,6 +1405,82 @@ def test_to_csv_proportion_distance_table(): assert ",0.5," in text +def test_to_csv_show_ext_distance_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke(app, ["--verbose", "--table", "distance", "--show-ext", str(input_example), str(output)]) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + print(text) + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert ",19/41," in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous + + +def test_to_csv_proportion_show_ext_distance_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke(app, ["--verbose", "--table", "distance", "--proportion", "--show-ext", str(input_example), str(output)]) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert ",0.4634146341463415/41," in 
text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous + + +def test_to_csv_similarity_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke(app, ["--verbose", "--table", "similarity", str(input_example), str(output)]) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert ",22," in text + + +def test_to_csv_proportion_similarity_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke( + app, ["--verbose", "--table", "similarity", "--proportion", str(input_example), str(output)] + ) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert ",0.5365853658536586," in text + + +def test_to_csv_show_ext_similarity_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke(app, ["--verbose", "--table", "similarity", "--show-ext", str(input_example), str(output)]) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + print(text) + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert "22/41" in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous + + +def test_to_csv_proportion_show_ext_similarity_table(): + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.csv" + result = runner.invoke(app, ["--verbose", "--table", "similarity", "--proportion", "--show-ext", 
str(input_example), str(output)]) + assert result.exit_code == 0 + assert output.exists() + text = output.read_text(encoding="utf-8-sig") + assert text.startswith(",UBS,P46,01,02,03,04,06") + assert "\nUBS," in text + assert "0.5365853658536586/41" in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous + + def test_to_csv_drop_constant_long_table(): with tempfile.TemporaryDirectory() as tmp_dir: output = Path(tmp_dir) / "test.csv" @@ -1496,7 +1572,7 @@ def test_to_tsv_distance_table(): text = output.read_text(encoding="utf-8-sig") assert text.startswith("\tUBS\tP46\t01\t02\t03\t04\t06") assert "\nUBS\t" in text - assert "\t13.0\t" in text + assert "\t13\t" in text def test_to_excel(): From fa4e6b2e5cea704c8b6e9c01efd1f0501d588e44 Mon Sep 17 00:00:00 2001 From: jjmccollum Date: Thu, 16 Jan 2025 02:59:47 +1100 Subject: [PATCH 2/2] :bug: Removed a print statement leftover from debugging --- teiphy/collation.py | 58 ++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/teiphy/collation.py b/teiphy/collation.py index 6ea6149..d5d9e0d 100644 --- a/teiphy/collation.py +++ b/teiphy/collation.py @@ -1749,7 +1749,7 @@ def to_distance_matrix(self, drop_constant: bool = False, proportion: bool = Fal Default value is False. proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units. Default value is False. - show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix should include the number of their extant, unambiguous variation units after the number of their disagreements. Default value is False. 
@@ -1772,16 +1772,20 @@ def to_distance_matrix(self, drop_constant: bool = False, proportion: bool = Fal # The type of the matrix will depend on the input options: matrix = None if show_ext: - matrix = np.full((len(witness_labels), len(witness_labels)), "NA", dtype=object) # strings of the form "disagreements/extant" + matrix = np.full( + (len(witness_labels), len(witness_labels)), "NA", dtype=object + ) # strings of the form "disagreements/extant" elif proportion: - matrix = np.full((len(witness_labels), len(witness_labels)), 0.0, dtype=float) # floats of the form disagreements/extant + matrix = np.full( + (len(witness_labels), len(witness_labels)), 0.0, dtype=float + ) # floats of the form disagreements/extant else: - matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form disagreements + matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form disagreements for i, wit_1 in enumerate(witness_labels): for j, wit_2 in enumerate(witness_labels): extant_units = 0 disagreements = 0 - # If either of the cells for this pair of witnesses has been populated already, + # If either of the cells for this pair of witnesses has been populated already, # then just copy the entry from the other side of the diagonal without recalculating: if i > j: matrix[i, j] = matrix[j, i] @@ -1822,7 +1826,7 @@ def to_similarity_matrix(self, drop_constant: bool = False, proportion: bool = F Default value is False. proportion (bool, optional): An optional flag indicating whether or not to calculate similarities as proportions over extant, unambiguous variation units. Default value is False. - show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix should include the number of their extant, unambiguous variation units after the number of agreements. Default value is False. 
@@ -1845,16 +1849,20 @@ def to_similarity_matrix(self, drop_constant: bool = False, proportion: bool = F # The type of the matrix will depend on the input options: matrix = None if show_ext: - matrix = np.full((len(witness_labels), len(witness_labels)), "NA", dtype=object) # strings of the form "agreements/extant" + matrix = np.full( + (len(witness_labels), len(witness_labels)), "NA", dtype=object + ) # strings of the form "agreements/extant" elif proportion: - matrix = np.full((len(witness_labels), len(witness_labels)), 0.0, dtype=float) # floats of the form agreements/extant + matrix = np.full( + (len(witness_labels), len(witness_labels)), 0.0, dtype=float + ) # floats of the form agreements/extant else: - matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form agreements + matrix = np.full((len(witness_labels), len(witness_labels)), 0, dtype=int) # ints of the form agreements for i, wit_1 in enumerate(witness_labels): for j, wit_2 in enumerate(witness_labels): extant_units = 0 agreements = 0 - # If either of the cells for this pair of witnesses has been populated already, + # If either of the cells for this pair of witnesses has been populated already, # then just copy the entry from the other side of the diagonal without recalculating: if i > j: matrix[i, j] = matrix[j, i] @@ -1870,8 +1878,6 @@ def to_similarity_matrix(self, drop_constant: bool = False, proportion: bool = F wit_2_rdg_inds = [l for l, w in enumerate(wit_2_rdg_support) if w > 0] if len(wit_1_rdg_inds) != 1 or len(wit_2_rdg_inds) != 1: continue - if i == 0 and j == 1: - print(vu_id, wit_1_rdg_inds[0], wit_2_rdg_inds[0]) extant_units += 1 if wit_1_rdg_inds[0] == wit_2_rdg_inds[0]: agreements += 1 @@ -2047,10 +2053,10 @@ def to_dataframe( table_type (TableType, optional): A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". 
- split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; - if False, then missing data is ignored (i.e., all states are 0). + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). Default value is True. - show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. Only applicable for tabular output formats of type \"distance\" or \"similarity\". Default value is False. @@ -2068,11 +2074,15 @@ def to_dataframe( df = pd.DataFrame(matrix, index=reading_labels, columns=witness_labels) elif table_type == TableType.distance: # Convert the collation to a NumPy array and get its row and column labels first: - matrix, witness_labels = self.to_distance_matrix(drop_constant=drop_constant, proportion=proportion, show_ext=show_ext) + matrix, witness_labels = self.to_distance_matrix( + drop_constant=drop_constant, proportion=proportion, show_ext=show_ext + ) df = pd.DataFrame(matrix, index=witness_labels, columns=witness_labels) elif table_type == TableType.similarity: # Convert the collation to a NumPy array and get its row and column labels first: - matrix, witness_labels = self.to_similarity_matrix(drop_constant=drop_constant, proportion=proportion, show_ext=show_ext) + matrix, witness_labels = self.to_similarity_matrix( + drop_constant=drop_constant, proportion=proportion, show_ext=show_ext + ) df = pd.DataFrame(matrix, index=witness_labels, columns=witness_labels) elif table_type == TableType.nexus: # Convert the collation to a NumPy array and get its 
row and column labels first: @@ -2112,10 +2122,10 @@ def to_csv( table_type: A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". - split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; - if False, then missing data is ignored (i.e., all states are 0). + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). Default value is True. - show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. Only applicable for tabular output formats of type \"distance\" or \"similarity\". Default value is False. @@ -2166,10 +2176,10 @@ def to_excel( table_type: A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". - split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; - if False, then missing data is ignored (i.e., all states are 0). + split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; + if False, then missing data is ignored (i.e., all states are 0). Default value is True. 
- show_ext: An optional flag indicating whether each cell in a distance or similarity matrix + show_ext: An optional flag indicating whether each cell in a distance or similarity matrix should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements. Only applicable for tabular output formats of type \"distance\" or \"similarity\". Default value is False. @@ -2426,7 +2436,7 @@ def to_file( table_type (TableType, optional): A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". - show_ext (bool, optional): An optional flag indicating whether each cell in a distance or similarity matrix + show_ext (bool, optional): An optional flag indicating whether each cell in a distance or similarity matrix should include the number of variation units where both witnesses are extant after the number of their disagreements/agreements. Only applicable for tabular output formats of type \"distance\" or \"similarity\". Default value is False.