From 0a9742b572f1f11b62b66b133c8aba810f4faaf0 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Wed, 27 Nov 2024 13:09:26 +0000
Subject: [PATCH 1/4] Replace setup.py with pyproject.toml

---
 pyproject.toml           | 34 ++++++++++++++++++++++++++++++++++
 pyterrier_t5/__init__.py |  2 ++
 setup.py                 | 28 ----------------------------
 3 files changed, 36 insertions(+), 28 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100644 setup.py

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b3867d4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,34 @@
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pyterrier-t5"
+description = "PyTerrier components for T5 ranking"
+requires-python = ">=3.9"
+authors = [
+  {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"},
+]
+maintainers = [
+  {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"},
+]
+readme = "README.md"
+classifiers = [
+  "Programming Language :: Python",
+  "Operating System :: OS Independent",
+  "Topic :: Text Processing",
+  "Topic :: Text Processing :: Indexing",
+  "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
+]
+dynamic = ["version", "dependencies"]
+
+[tool.setuptools.dynamic]  # resolves the fields declared under `dynamic` in [project]
+version = {attr = "pyterrier_t5.__version__"}  # taken from __version__ in pyterrier_t5/__init__.py
+dependencies = {file = ["requirements.txt"]}  # NOTE(review): the removed setup.py rewrote 'git+' lines to 'name @ url'; this does not - confirm requirements.txt has no such lines
+
+[tool.setuptools.packages.find]
+exclude = ["tests"]
+
+[project.urls]
+Repository = "https://github.com/terrierteam/pyterrier_t5"
+"Bug Tracker" = "https://github.com/terrierteam/pyterrier_t5/issues"
diff --git a/pyterrier_t5/__init__.py b/pyterrier_t5/__init__.py
index 3524e8b..a1310e8 100644
--- a/pyterrier_t5/__init__.py
+++ b/pyterrier_t5/__init__.py
@@ -1,3 +1,5 @@
+__version__ = '0.1.0'
+
 import sys
 import math
 import warnings
diff --git a/setup.py b/setup.py
deleted file mode 100644
index e822108..0000000
--- a/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import setuptools
-
-requirements = []
-with open('requirements.txt', 'rt') as f:
-    for req in f.read().splitlines():
-        if req.startswith('git+'):
-            pkg_name = req.split('/')[-1].replace('.git', '')
-            if "#egg=" in pkg_name:
-                pkg_name = pkg_name.split("#egg=")[1]
-            requirements.append(f'{pkg_name} @ {req}')
-        else:
-            requirements.append(req)
-
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-
-setuptools.setup(
-    name="pyterrier-t5",
-    version="0.0.1",
-    author="Sean MacAvaney",
-    author_email='sean.macavaney{at}.glasgow.ac.uk',
-    description="PyTerrier components for T5 ranking",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    packages=setuptools.find_packages(),
-    install_requires=requirements,
-    python_requires='>=3.9',
-)

From 5173938c3589d2197b97a131defd0a4be794c657 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Wed, 27 Nov 2024 13:14:31 +0000
Subject: [PATCH 2/4] github actions

---
 .github/workflows/deploy.yml | 27 ++++++++++++++++++
 .github/workflows/push.yml   | 52 ----------------------------------
 .github/workflows/style.yml  | 36 +++++++++++++++++++++++
 .github/workflows/test.yml   | 55 ++++++++++++++++++++++++++++++++++++
 requirements-dev.txt         |  5 ++++
 5 files changed, 123 insertions(+), 52 deletions(-)
 create mode 100644 .github/workflows/deploy.yml
 delete mode 100644 .github/workflows/push.yml
 create mode 100644 .github/workflows/style.yml
 create mode 100644 .github/workflows/test.yml
 create mode 100644 requirements-dev.txt

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..6615edb
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,27 @@
+name: deploy
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  pypi:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4  # same major version as the style/test workflows
+    - uses: actions/setup-python@v5  # same major version as the style/test workflows
+      with:
+        python-version: '3.x'
+    - name: install-deps
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine build
+    - name: build
+      run: |
+        python -m build
+    - name: upload
+      env:
+        TWINE_USERNAME: __token__  # API-token auth; the token itself is supplied via TWINE_PASSWORD
+        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+      run: |
+        twine upload dist/*
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
deleted file mode 100644
index ce75b89..0000000
--- a/.github/workflows/push.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Python package
-
-on:
-  push:
-    branches: [ master ]
-  pull_request: {}
-  schedule: [cron: '0 12 * * 3'] # every Wednesday at noon
-
-jobs:
-  build:
-    
-    strategy:
-      matrix:
-        python-version: ['3.9', '3.12']
-        java: [11] # [11, 13]
-        os: ['ubuntu-latest'] # ['ubuntu-latest', 'macOs-latest', 'windows-latest']
-        architecture: ['x64']
-        terrier: ['snapshot']
-
-    runs-on: ${{ matrix.os }}
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-    
-    - name: Setup java
-      uses: actions/setup-java@v4
-      with:
-        distribution: 'temurin'
-        java-version: ${{ matrix.java }}
-        architecture: ${{ matrix.architecture }}
-
-
-    - name: Install Python dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements.txt
-        #install this software
-        pip install --timeout=120 .
-        pip install pytest
-
-    - name: All unit tests
-      env:
-        TERRIER_VERSION: ${{ matrix.terrier }}
-      run: |
-        pytest
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
new file mode 100644
index 0000000..27cabee
--- /dev/null
+++ b/.github/workflows/style.yml
@@ -0,0 +1,36 @@
+name: style
+
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+
+jobs:
+  ruff:
+    strategy:
+      matrix:
+        python-version: ['3.10']
+        os: ['ubuntu-latest']
+
+    runs-on: ${{ matrix.os }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - name: Install Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Cache Dependencies
+      uses: actions/cache@v4
+      with:
+        path: ${{ env.pythonLocation }}
+        key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}
+
+    - name: Install Dependencies
+      run: |
+        pip install --upgrade -r requirements-dev.txt
+        pip install -e .
+
+    - name: Ruff
+      run: 'ruff check --output-format=github pyterrier_t5'
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..bf128ec
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,55 @@
+name: test
+
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+  schedule: [cron: '0 12 * * 3'] # every Wednesday at noon
+
+jobs:
+  pytest:
+    strategy:
+      matrix:
+        os: ['ubuntu-latest']
+        python-version: ['3.9', '3.12']
+
+    runs-on: ${{ matrix.os }}
+    env:
+      runtag: ${{ matrix.os }}-${{ matrix.python-version }}
+
+    steps:
+     - name: Checkout
+       uses: actions/checkout@v4
+
+     - name: Install Python ${{ matrix.python-version }}
+       uses: actions/setup-python@v5
+       with:
+         python-version: ${{ matrix.python-version }}
+
+     - name: Cache Dependencies
+       uses: actions/cache@v4
+       with:
+         path: ${{ env.pythonLocation }}
+         key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}
+
+     - name: Loading Torch models from cache
+       uses: actions/cache@v4  # same major version as the dependency cache step above
+       with:
+         path: /home/runner/.cache/
+         key: model-cache
+
+     - name: Install Dependencies
+       run: |
+         pip install --upgrade -r requirements.txt -r requirements-dev.txt
+         pip install -e .
+
+     - name: Unit Test
+       run: |
+         pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_t5 --cov-report json:${{ env.runtag }}.coverage.json tests/
+
+     - name: Report Test Results
+       if: always()
+       run: |
+         printf "**Test Results**\n\n" >> $GITHUB_STEP_SUMMARY
+         jq '.summary' ${{ env.runtag }}.results.json >> $GITHUB_STEP_SUMMARY
+         printf "\n\n**Test Coverage**\n\n" >> $GITHUB_STEP_SUMMARY
+         jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> $GITHUB_STEP_SUMMARY
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..27a0e5e
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,5 @@
+pytest
+pytest-subtests
+pytest-cov
+pytest-json-report
+ruff

From f69b3b361080643dfb5a3a79904f593c82762902 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Wed, 27 Nov 2024 13:15:15 +0000
Subject: [PATCH 3/4] .gitignore

---
 .gitignore | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 0a5c4a6..9439360 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,118 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+logs/
+wheels/
 *.egg-info/
-.idea
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+*.res
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# mac
+.DS_Store
+
+#java
+terrier-python-helper/target/
+.vscode
+.classpath
+.project
+terrier-python-helper/.settings
+*jar
+.idea

From 0bb2bfe16149555ffdf017fc9143c94644af0947 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Wed, 27 Nov 2024 13:16:56 +0000
Subject: [PATCH 4/4] fix ruff errors

---
 pyterrier_t5/__init__.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/pyterrier_t5/__init__.py b/pyterrier_t5/__init__.py
index a1310e8..0627585 100644
--- a/pyterrier_t5/__init__.py
+++ b/pyterrier_t5/__init__.py
@@ -1,18 +1,14 @@
 __version__ = '0.1.0'
 
-import sys
 import math
 import warnings
 import itertools
 import pyterrier as pt
-import pandas as pd
 from collections import defaultdict
 from pyterrier.model import add_ranks
 import torch
 from torch.nn import functional as F
-from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration
-from typing import List
-import re
+from transformers import T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration
 
 
 class MonoT5ReRanker(pt.Transformer):
@@ -41,7 +37,7 @@ def transform(self, run):
         scores = []
         queries, texts = run['query'], run[self.text_field]
         it = range(0, len(queries), self.batch_size)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
         if self.verbose:
             it = pt.tqdm(it, desc='monoT5', unit='batches')
@@ -93,9 +89,8 @@ def __str__(self):
         return f"DuoT5({self.model_name})"
 
     def transform(self, run):
-        queries, texts = run['query'], run[self.text_field]
         scores = defaultdict(lambda: 0.)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
         for batch in self._iter_duo_batches(run):
             enc_query = self.tokenizer.batch_encode_plus([f'Query: {q}' for q in batch['query']], return_tensors='pt', padding='longest')
@@ -194,7 +189,7 @@ def transform(self, run):
         scores = []
         queries, texts = run['query'], run[self.text_field]
         it = range(0, len(queries), self.batch_size)
-        prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
+        prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
         max_vlen = 512 - prompts['input_ids'].shape[1] #mT5Config doesn't have n_positions so we fallback to 512
         if self.verbose:
             it = pt.tqdm(it, desc='monoT5', unit='batches')