From 0a9742b572f1f11b62b66b133c8aba810f4faaf0 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Wed, 27 Nov 2024 13:09:26 +0000 Subject: [PATCH 1/4] Replace setup.py with pyproject.toml --- pyproject.toml | 34 ++++++++++++++++++++++++++++++++++ pyterrier_t5/__init__.py | 2 ++ setup.py | 28 ---------------------------- 3 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b3867d4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyterrier-t5" +description = "PyTerrier components for T5 ranking" +requires-python = ">=3.9" +authors = [ + {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, +] +maintainers = [ + {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, +] +readme = "README.md" +classifiers = [ + "Programming Language :: Python", + "Operating System :: OS Independent", + "Topic :: Text Processing", + "Topic :: Text Processing :: Indexing", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", +] +dynamic = ["version", "dependencies"] + +[tool.setuptools.dynamic] +version = {attr = "pyterrier_t5.__version__"} +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools.packages.find] +exclude = ["tests"] + +[project.urls] +Repository = "https://github.com/terrierteam/pyterrier_t5" +"Bug Tracker" = "https://github.com/terrierteam/pyterrier_t5/issues" diff --git a/pyterrier_t5/__init__.py b/pyterrier_t5/__init__.py index 3524e8b..a1310e8 100644 --- a/pyterrier_t5/__init__.py +++ b/pyterrier_t5/__init__.py @@ -1,3 +1,5 @@ +__version__ = '0.1.0' + import sys import math import warnings diff --git a/setup.py b/setup.py deleted file mode 100644 index e822108..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -import setuptools - -requirements = [] -with open('requirements.txt', 'rt') as f: - for req in f.read().splitlines(): - if req.startswith('git+'): - pkg_name = req.split('/')[-1].replace('.git', '') - if "#egg=" in pkg_name: - pkg_name = pkg_name.split("#egg=")[1] - requirements.append(f'{pkg_name} @ {req}') - else: - requirements.append(req) - -with open("README.md", "r") as fh: - long_description = fh.read() - -setuptools.setup( - name="pyterrier-t5", - version="0.0.1", - author="Sean MacAvaney", - author_email='sean.macavaney{at}.glasgow.ac.uk', - description="PyTerrier components for T5 ranking", - long_description=long_description, - long_description_content_type="text/markdown", - packages=setuptools.find_packages(), - install_requires=requirements, - python_requires='>=3.9', -) From 5173938c3589d2197b97a131defd0a4be794c657 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Wed, 27 Nov 2024 13:14:31 +0000 Subject: [PATCH 2/4] github actions --- .github/workflows/deploy.yml | 27 ++++++++++++++++++ .github/workflows/push.yml | 52 ---------------------------------- .github/workflows/style.yml | 36 +++++++++++++++++++++++ .github/workflows/test.yml | 55 ++++++++++++++++++++++++++++++++++++ requirements-dev.txt | 5 ++++ 5 files changed, 123 insertions(+), 52 deletions(-) create mode 100644 .github/workflows/deploy.yml delete mode 100644 .github/workflows/push.yml create mode 100644 .github/workflows/style.yml create mode 100644 .github/workflows/test.yml create mode 100644 requirements-dev.txt diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..6615edb --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,27 @@ +name: deploy + +on: + release: + types: [created] + +jobs: + pypi: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: install-deps + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine build + - name: build + run: | + python -m build + - name: upload + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + twine upload dist/* diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml deleted file mode 100644 index ce75b89..0000000 --- a/.github/workflows/push.yml +++ /dev/null @@ -1,52 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python package - -on: - push: - branches: [ master ] - pull_request: {} - schedule: [cron: '0 12 * * 3'] # every Wednesday at noon - -jobs: - build: - - strategy: - matrix: - python-version: ['3.9', '3.12'] - java: [11] # [11, 13] - os: ['ubuntu-latest'] # ['ubuntu-latest', 'macOs-latest', 'windows-latest'] - architecture: ['x64'] - terrier: ['snapshot'] - - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup java - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: ${{ matrix.java }} - architecture: ${{ matrix.architecture }} - - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - #install this software - pip install --timeout=120 . - pip install pytest - - - name: All unit tests - env: - TERRIER_VERSION: ${{ matrix.terrier }} - run: | - pytest diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..27cabee --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,36 @@ +name: style + +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + +jobs: + ruff: + strategy: + matrix: + python-version: ['3.10'] + os: ['ubuntu-latest'] + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements-dev.txt + pip install -e . + + - name: Ruff + run: 'ruff check --output-format=github pyterrier_t5' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..bf128ec --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,55 @@ +name: test + +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + schedule: [cron: '0 12 * * 3'] # every Wednesday at noon + +jobs: + pytest: + strategy: + matrix: + os: ['ubuntu-latest'] + python-version: ['3.9', '3.12'] + + runs-on: ${{ matrix.os }} + env: + runtag: ${{ matrix.os }}-${{ matrix.python-version }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Loading Torch models from cache + uses: actions/cache@v3 + with: + path: /home/runner/.cache/ + key: model-cache + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements.txt -r requirements-dev.txt + pip install -e . + + - name: Unit Test + run: | + pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_t5 --cov-report json:${{ env.runtag }}.coverage.json tests/ + + - name: Report Test Results + if: always() + run: | + printf "**Test Results**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.summary' ${{ env.runtag }}.results.json >> $GITHUB_STEP_SUMMARY + printf "\n\n**Test Coverage**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> $GITHUB_STEP_SUMMARY diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..27a0e5e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,5 @@ +pytest +pytest-subtests +pytest-cov +pytest-json-report +ruff From f69b3b361080643dfb5a3a79904f593c82762902 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Wed, 27 Nov 2024 13:15:15 +0000 Subject: [PATCH 3/4] .gitignore --- .gitignore | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0a5c4a6..9439360 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,118 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +logs/ +wheels/ *.egg-info/ -.idea +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ +*.res + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# mac +.DS_Store + +#java +terrier-python-helper/target/ +.vscode +.classpath +.project +terrier-python-helper/.settings +*jar +logs/ From 0bb2bfe16149555ffdf017fc9143c94644af0947 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Wed, 27 Nov 2024 13:16:56 +0000 Subject: [PATCH 4/4] fix ruff errors --- pyterrier_t5/__init__.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pyterrier_t5/__init__.py b/pyterrier_t5/__init__.py index a1310e8..0627585 100644 --- a/pyterrier_t5/__init__.py +++ b/pyterrier_t5/__init__.py @@ -1,18 +1,14 @@ __version__ = '0.1.0' -import sys import math import warnings import itertools import pyterrier as pt -import pandas as pd from collections import defaultdict from pyterrier.model import add_ranks import torch from torch.nn import functional as F -from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration -from typing import List -import re +from transformers import T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration class MonoT5ReRanker(pt.Transformer): @@ -41,7 +37,7 @@ def transform(self, run): scores = [] queries, texts = run['query'], run[self.text_field] it = range(0, len(queries), self.batch_size) - prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') + prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1] if self.verbose: it = pt.tqdm(it, desc='monoT5', unit='batches') @@ -93,9 +89,8 @@ def __str__(self): return f"DuoT5({self.model_name})" def transform(self, run): - queries, texts = run['query'], run[self.text_field] scores = defaultdict(lambda: 0.) - prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') + prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1] for batch in self._iter_duo_batches(run): enc_query = self.tokenizer.batch_encode_plus([f'Query: {q}' for q in batch['query']], return_tensors='pt', padding='longest') @@ -194,7 +189,7 @@ def transform(self, run): scores = [] queries, texts = run['query'], run[self.text_field] it = range(0, len(queries), self.batch_size) - prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') + prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest') max_vlen = 512 - prompts['input_ids'].shape[1] #mT5Config doesn't have n_positions so we fallback to 512 if self.verbose: it = pt.tqdm(it, desc='monoT5', unit='batches')