terrierteam · seanmacavaney · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -0,0 +1,27 @@
+name: deploy  # Builds the package and uploads it to PyPI with twine.
+
+on:
+  release:
+    types: [created]  # runs when a GitHub release is created
+
+jobs:
+  pypi:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4  # same major version as the style/test workflows
+    - uses: actions/setup-python@v5  # same major version as the style/test workflows
+      with:
+        python-version: '3.x'
+    - name: install-deps  # build frontend (build) and uploader (twine)
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine build
+    - name: build  # python -m build writes its artifacts to dist/ by default
+      run: |
+        python -m build
+    - name: upload
+      env:
+        TWINE_USERNAME: __token__  # fixed username used for PyPI API-token auth
+        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}  # read from the repository secret PYPI_TOKEN
+      run: |
+        twine upload dist/*
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
@@ -0,0 +1,36 @@
+name: style  # Lints the pyterrier_t5 package with ruff.
+
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+
+jobs:
+  ruff:
+    strategy:
+      matrix:  # single-entry matrix: one Python version on one OS
+        python-version: ['3.10']  # quoted so YAML keeps it the string "3.10", not the float 3.1
+        os: ['ubuntu-latest']
+
+    runs-on: ${{ matrix.os }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - name: Install Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Cache Dependencies
+      uses: actions/cache@v4
+      with:
+        path: ${{ env.pythonLocation }}
+        key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}  # key changes when either requirements file changes
+
+    - name: Install Dependencies
+      run: |
+        pip install --upgrade -r requirements-dev.txt
+        pip install -e .
+
+    - name: Ruff
+      run: 'ruff check --output-format=github pyterrier_t5'  # findings are emitted in ruff's "github" output format
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,55 @@
+name: test  # Runs the pytest suite and writes results and coverage to the job summary.
+
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+  schedule: [cron: '0 12 * * 3'] # every Wednesday at noon UTC
+
+jobs:
+  pytest:
+    strategy:
+      matrix:
+        os: ['ubuntu-latest']
+        python-version: ['3.9', '3.12']
+
+    runs-on: ${{ matrix.os }}
+    env:
+      runtag: ${{ matrix.os }}-${{ matrix.python-version }}  # per-matrix-cell tag used in cache keys and report file names
+
+    steps:
+     - name: Checkout
+       uses: actions/checkout@v4
+
+     - name: Install Python ${{ matrix.python-version }}
+       uses: actions/setup-python@v5
+       with:
+         python-version: ${{ matrix.python-version }}
+
+     - name: Cache Dependencies
+       uses: actions/cache@v4
+       with:
+         path: ${{ env.pythonLocation }}
+         key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}  # key changes when either requirements file changes
+
+     - name: Loading Torch models from cache
+       uses: actions/cache@v4  # was v3; aligned with the dependency cache step in this job
+       with:
+         path: /home/runner/.cache/
+         key: model-cache  # fixed key shared by every matrix cell and run
+
+     - name: Install Dependencies
+       run: |
+         pip install --upgrade -r requirements.txt -r requirements-dev.txt
+         pip install -e .
+
+     - name: Unit Test
+       run: |
+         pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_t5 --cov-report json:${{ env.runtag }}.coverage.json tests/
+
+     - name: Report Test Results
+       if: always()  # still write the summary when the test step fails
+       run: |
+         printf "**Test Results**\n\n" >> $GITHUB_STEP_SUMMARY
+         jq '.summary' ${{ env.runtag }}.results.json >> $GITHUB_STEP_SUMMARY
+         printf "\n\n**Test Coverage**\n\n" >> $GITHUB_STEP_SUMMARY
+         jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> $GITHUB_STEP_SUMMARY
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,118 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+logs/
+wheels/
 *.egg-info/
-.idea
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+*.res
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# macOS
+.DS_Store
+
+# Java
+terrier-python-helper/target/
+.vscode
+.classpath
+.project
+terrier-python-helper/.settings
+*jar
+logs/
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,34 @@
+[build-system]  # Build configuration and package metadata for pyterrier-t5.
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pyterrier-t5"
+description = "PyTerrier components for T5 ranking"
+requires-python = ">=3.9"
+authors = [
+  {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"},
+]
+maintainers = [
+  {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"},
+]
+readme = "README.md"
+classifiers = [
+  "Programming Language :: Python",
+  "Operating System :: OS Independent",
+  "Topic :: Text Processing",
+  "Topic :: Text Processing :: Indexing",
+  "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
+]
+dynamic = ["version", "dependencies"]  # both are supplied by [tool.setuptools.dynamic] below
+
+[tool.setuptools.dynamic]
+version = {attr = "pyterrier_t5.__version__"}  # taken from the package's __version__ attribute
+dependencies = {file = ["requirements.txt"]}  # taken from requirements.txt
+
+[tool.setuptools.packages.find]
+exclude = ["tests"]  # keep the tests package out of package discovery
+
+[project.urls]
+Repository = "https://github.com/terrierteam/pyterrier_t5"
+"Bug Tracker" = "https://github.com/terrierteam/pyterrier_t5/issues"
diff --git a/pyterrier_t5/__init__.py b/pyterrier_t5/__init__.py
@@ -1,16 +1,14 @@
-import sys
+__version__ = '0.1.0'
+
 import math
 import warnings
 import itertools
 import pyterrier as pt
-import pandas as pd
 from collections import defaultdict
 from pyterrier.model import add_ranks
 import torch
 from torch.nn import functional as F
-from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration
-from typing import List
-import re
+from transformers import T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration
 
 
 class MonoT5ReRanker(pt.Transformer):
@@ -39,7 +37,7 @@ def transform(self, run):
         scores = []
         queries, texts = run['query'], run[self.text_field]
         it = range(0, len(queries), self.batch_size)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
         if self.verbose:
             it = pt.tqdm(it, desc='monoT5', unit='batches')
@@ -91,9 +89,8 @@ def __str__(self):
         return f"DuoT5({self.model_name})"
 
     def transform(self, run):
-        queries, texts = run['query'], run[self.text_field]
         scores = defaultdict(lambda: 0.)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
         for batch in self._iter_duo_batches(run):
             enc_query = self.tokenizer.batch_encode_plus([f'Query: {q}' for q in batch['query']], return_tensors='pt', padding='longest')
@@ -192,7 +189,7 @@ def transform(self, run):
         scores = []
         queries, texts = run['query'], run[self.text_field]
         it = range(0, len(queries), self.batch_size)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = 512 - prompts['input_ids'].shape[1] #mT5Config doesn't have n_positions so we fallback to 512
         if self.verbose:
             it = pt.tqdm(it, desc='monoT5', unit='batches')

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,5 @@
+pytest  # test runner invoked by the test workflow
+pytest-subtests
+pytest-cov  # provides the --cov / --cov-report options used by the test workflow
+pytest-json-report  # provides the --json-report options used by the test workflow
+ruff  # linter run by the style workflow