Merge pull request #282 from MannLabs/development

Development
MannLabs · Jan 22, 2025 · c9792ce · c9792ce
2 parents 774d1ab + 81d9d7d
commit c9792ce
Show file tree

Hide file tree

Showing 61 changed files with 4,113 additions and 2,859 deletions.
diff --git a/.github/workflows/_run_tests.yml b/.github/workflows/_run_tests.yml
@@ -14,7 +14,7 @@ on:
         required: true
         type: string
 jobs:
-  pre-commit:
+  run-tests:
     runs-on: ${{ inputs.os }}
     steps:
     - uses: actions/checkout@v3
@@ -35,7 +35,7 @@ jobs:
       shell: bash -le {0}
       run: |
         conda activate alphabase
-        pip install pytest nbmake==1.5.3
+        pip install -r requirements/requirements_tests.txt
         conda deactivate
     - name: Run tests
       shell: bash -le {0}

diff --git a/.github/workflows/branch-checks.yaml b/.github/workflows/branch-checks.yaml
@@ -24,3 +24,11 @@ jobs:
       python-version: ${{ matrix.python-version }}
       os: ${{ matrix.os }}
       install-script: pip_install.sh tests
+  get-code-review-input:
+    runs-on: ubuntu-latest
+    #if: contains(github.event.pull_request.labels.*.name, 'code-review')
+    steps:
+      - uses: MannLabs/alphashared/actions/get-code-review-input@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.number }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,6 +11,16 @@ repos:
   rev: v0.4.0
   hooks:
     - id: ruff-format
+    # running ruff with rules in pyproject.toml (all files, limited rules)
     - id: ruff
       args:
         - "--fix"
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.7.3 # newer version -> stricter
+  hooks:
+    - id: ruff
+      # running ruff again with rules in ruff-lint-psm-readers.toml (specific files, all rules)
+      args:
+        - "--config"
+        - "ruff-lint-psm-readers.toml"
+        - "--fix"
diff --git a/README.md b/README.md
@@ -198,17 +198,35 @@ For an even more interactive participation, check out the
 
 ### Notes for developers
 
-#### Tagging of changes
-In order to have release notes automatically generated, changes need to be tagged with labels.
-The following labels are used (should be safe-explanatory):
-`breaking-change`, `bug`, `enhancement`.
+#### 1. Code Structure
+While AlphaBase offers an object-oriented interface, algorithms for manipulating data should be implemented in a functional way and called from class methods. This allows the functions to be reused without instantiating a class.
 
-#### Release a new version
-This package uses a shared release process defined in the
-[alphashared](https://github.com/MannLabs/alphashared) repository. Please see the instructions
-[there](https://github.com/MannLabs/alphashared/blob/reusable-release-workflow/.github/workflows/README.md#release-a-new-version).
+#### 2. DataFrame Handling
+- Return DataFrames in the same order as they were passed
+- Minimize in-place modifications of DataFrames. Mention them explicitly in the docstring
+- Implement low-level functions that operate on numpy arrays and return arrays. Use higher-level functions to assign array results to DataFrames
+
+#### 3. Data Assumptions
+Avoid making assumptions about:
+- Precursor ordering by `nAA`
+- Fragment indices ordering (e.g., `frag_start_idx`)
+- Continuity of `frag_start_idx` where `frag_start_idx[i+1] == frag_stop_idx[i]`
+- All fragments being assigned to a precursor
+
+Assumptions are only permitted for low-level or optimized functions and should be documented in the docstring.
+
+#### 3. Optimization Strategy
+When performance optimization is needed:
+1. Implement the general solution first
+2. Add optimized versions for special cases, e.g. a refined precursor DataFrame or precursors ordered by `nAA`
+3. Check conditions at runtime to use optimized versions when applicable
+
+#### 4. Code Quality
+- Include python type hints
+- Include docstrings in numpy style (see [numpy docstring example](https://www.sphinx-doc.org/en/master/usage/extensions/example_numpy.html#example-numpy))
 
-#### pre-commit hooks
+
+#### 5. pre-commit hooks
 It is highly recommended to use the provided pre-commit hooks, as the CI pipeline enforces all checks therein to
 pass in order to merge a branch.
 
@@ -221,9 +239,19 @@ You can run the checks yourself using:
 pre-commit run --all-files
 ```
 
+#### 6. Tagging of Pull Requests
+In order to have release notes automatically generated, pull requests need to be tagged with labels.
+The following labels are used (should be self-explanatory):
+`breaking-change`, `bug`, `enhancement`.
+
+#### 7. Release a new version
+This package uses a shared release process defined in the
+[alphashared](https://github.com/MannLabs/alphashared) repository. Please see the instructions
+[there](https://github.com/MannLabs/alphashared/blob/reusable-release-workflow/.github/workflows/README.md#release-a-new-version).
+
+
 ------------------------------------------------------------------------
 
 ## Changelog
 
-See the [HISTORY.md](HISTORY.md) for a full overview of the changes made
-in each version.
+For a full overview of the changes made in each version, see [CHANGELOG.md](CHANGELOG.md) (up to version 1.1.0) and the GitHub release notes (for versions after 1.1.0).
diff --git a/alphabase/constants/_const.py b/alphabase/constants/_const.py
@@ -5,6 +5,7 @@
 from alphabase.yaml_utils import load_yaml
 
 CONST_FILE_FOLDER = os.path.join(os.path.dirname(__file__), "const_files")
+PSM_READER_YAML_FILE_NAME = "psm_reader.yaml"
 
 common_const_dict: dict = load_yaml(
     os.path.join(CONST_FILE_FOLDER, "common_constants.yaml")

diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -1,3 +1,26 @@
+alphadia:
+  reader_type: alphadia
+  rt_unit: minute
+  fixed_C57: False
+  column_mapping:
+    'raw_name': 'run'
+    'sequence': 'sequence'
+    'charge': 'charge'
+    'rt': 'rt_observed'
+    'rt_start': 'rt_start'
+    'rt_stop': 'rt_stop'
+    'ccs': 'ccs'
+    'mobility': 'mobility'
+    'proteins': 'proteins'
+    'uniprot_ids': 'uniprot_ids'
+    'genes': 'genes'
+#    'scan_num': '' ?
+    'score': 'score'
+    'fdr': 'fdr'
+    'mods': 'mods'
+    'intensity': 'intensity'
+  modification_mapping_type: 'maxquant' # TODO: None?
+
 alphapept:
   reader_type: alphapept
   rt_unit: minute
@@ -13,18 +36,15 @@ alphapept:
     'raw_name': 'raw_name' # parse from `ms_data.hdf` file
     'fdr': 'q_value'
     'decoy': 'decoy'
-  modification_mapping:
-    'Carbamidomethyl@C': 'cC'
-    'Oxidation@M': 'oxM'
-    'Phospho@S': 'pS'
-    'Phospho@T': 'pT'
-    'Phospho@Y': 'pY'
-    'Acetyl@Protein_N-term': 'a'
+  modification_mapping_type: 'alphapept'
+
 
 maxquant:
   reader_type: maxquant
   rt_unit: minute
   fixed_C57: True
+  mod_seq_columns:
+    - 'Modified sequence'
   column_mapping:
     'sequence': 'Sequence'
     'charge': 'Charge'
@@ -47,8 +67,14 @@ maxquant:
     'genes': ['Gene Names','Gene names']
     'decoy': 'Reverse'
     'intensity': 'Intensity'
+  modification_mapping_type: 'maxquant'
 
-  modification_mapping:
+modification_mappings:
+  maxquant:
+    'mTRAQ@K':
+      - 'K(mTRAQ)'
+    'mTRAQ@Any_N-term':
+      - '(mTRAQ)'
     'Dimethyl@K':
       - 'K(Dimethyl)'
     'Dimethyl@R':
@@ -102,6 +128,13 @@ maxquant:
     'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']
     'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']
     'hydroxyisobutyryl@K': 'K(2-)'
+  alphapept:
+    'Carbamidomethyl@C': 'cC'
+    'Oxidation@M': 'oxM'
+    'Phospho@S': 'pS'
+    'Phospho@T': 'pT'
+    'Phospho@Y': 'pY'
+    'Acetyl@Protein_N-term': 'a'
 
 pfind:
   reader_type: pfind
@@ -118,8 +151,7 @@ pfind:
     'uniprot_ids': 'Proteins'
     'fdr': 'Q-value'
     'decoy': ['Target/Decoy', 'Targe/Decoy']
-  modification_mapping:
-    '': ''
+  modification_mapping_type: null  # no mapping required
 
 msfragger_pepxml:
   reader_type: msfragger_pepxml
@@ -131,12 +163,9 @@ msfragger_pepxml:
     'query_id': 'spectrum'
     'scan_num': 'start_scan'
     'score': 'expect'
-    'fdr': 'expect'
     'proteins': 'protein'
     'raw_name': 'raw_name'
     'mobility': 'ion_mobility'
-  modification_mapping:
-    '': ''
   mass_mapped_mods:
     - 'Oxidation@M' #other Oxidation@X are not needed here
     - 'Carbamidomethyl@C'
@@ -149,6 +178,7 @@ msfragger_pepxml:
     - 'Dimethyl@K' # Any_N-term is not needed here as it will be inferred on-the-fly
     - 'Methyl@E' #an example of a PTM that can be C-term
   mod_mass_tol: 0.1 # Da
+  modification_mapping_type: 'maxquant'
 
 diann:
   reader_type: diann
@@ -169,7 +199,9 @@ diann:
     'scan_num': 'MS2.Scan'
     'score': 'CScore'
     'fdr': 'Q.Value'
-  modification_mapping: 'maxquant'
+  mod_seq_columns:
+    - "Modified.Sequence"
+  modification_mapping_type: 'maxquant'
 
 spectronaut_report:
   reader_type: spectronaut_report
@@ -183,19 +215,16 @@ spectronaut_report:
     'genes': 'PG.Genes'
     'uniprot_ids': 'PG.UniProtIds'
     'charge': 'charge'
-  modification_mapping: 'maxquant'
+  mod_seq_columns:
+    - 'ModifiedSequence'
+  precursor_id_columns:
+    - "EG.PrecursorId"
+  modification_mapping_type: 'maxquant'
 
 spectronaut:
   reader_type: spectronaut
   rt_unit: irt
   fixed_C57: False
-  mod_seq_columns:
-    - 'ModifiedPeptide'
-    - 'ModifiedSequence'
-    - 'FullUniModPeptideName'
-    - 'ModifiedPeptideSequence'
-    - 'LabeledSequence'
-    - 'FullUniModPeptideName'
   column_mapping:
     'raw_name': 'ReferenceRun'
     'sequence': ['StrippedPeptide','PeptideSequence']
@@ -207,20 +236,21 @@ spectronaut:
     'proteins': ['Protein Name','ProteinId','ProteinID','ProteinName','ProteinGroup','ProteinGroups']
     'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
     'genes': ['Genes','Gene','GeneName','GeneNames']
-  modification_mapping: 'maxquant'
-
-library_reader_base:
-  reader_type: library_reader_base
-  rt_unit: irt
-  fixed_C57: False
-  csv_sep: "\t"
   mod_seq_columns:
-    - 'ModifiedPeptideSequence'
     - 'ModifiedPeptide'
     - 'ModifiedSequence'
     - 'FullUniModPeptideName'
+    - 'ModifiedPeptideSequence'
     - 'LabeledSequence'
     - 'FullUniModPeptideName'
+  precursor_id_columns:
+    - "EG.PrecursorId"
+  modification_mapping_type: 'maxquant'
+
+library_reader_base:
+  reader_type: library_reader_base
+  rt_unit: irt
+  fixed_C57: False
   column_mapping:
     'raw_name': 'ReferenceRun'
     'sequence': ['PeptideSequence', 'StrippedPeptide']
@@ -239,11 +269,18 @@ library_reader_base:
     'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge']
     'fragment_series': ['FragmentSeriesNumber','FragmentNumber']
     'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType']
-  modification_mapping: 'maxquant'
+  mod_seq_columns:
+    - 'ModifiedPeptideSequence'
+    - 'ModifiedPeptide'
+    - 'ModifiedSequence'
+    - 'FullUniModPeptideName'
+    - 'LabeledSequence'
+    - 'FullUniModPeptideName'
+  modification_mapping_type: 'maxquant'
 
 sage:
   reader_type: sage
-  rt_unit: minute
+  rt_unit: second
   column_mapping:
     'modified_sequence': 'peptide'
     'sequence': 'stripped_peptide'
@@ -258,3 +295,4 @@ sage:
     'peptide_fdr': 'peptide_q'
     'protein_fdr': 'protein_q'
     'decoy': 'is_decoy'
+  modification_mapping_type: null # custom mapping in code