Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Package modernization #16

Merged
merged 4 commits into from
Nov 27, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Builds the distribution with `python -m build` and uploads it to PyPI with
# twine whenever a GitHub release is created.
name: deploy

on:
  release:
    types: [created]

jobs:
  pypi:
    runs-on: ubuntu-latest
    steps:
      # Use the same major versions as the style/test workflows (checkout@v4,
      # setup-python@v5); the v2 releases target a retired Node.js runtime.
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.x'
      - name: install-deps
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine build
      - name: build
        run: |
          python -m build
      - name: upload
        env:
          # Token-based upload: the username is the literal "__token__" and
          # the password comes from the PYPI_TOKEN repository secret.
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          twine upload dist/*
52 changes: 0 additions & 52 deletions .github/workflows/push.yml

This file was deleted.

36 changes: 36 additions & 0 deletions .github/workflows/style.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Lint workflow: installs the package with its dev requirements and runs
# `ruff check` over the pyterrier_t5 package.
name: style

on:
  # pushes to main
  push:
    branches:
      - main
  # all PRs
  pull_request: {}

jobs:
  ruff:
    strategy:
      matrix:
        # Version strings stay quoted so '3.10' is not parsed as the float 3.1.
        python-version:
          - '3.10'
        os:
          - 'ubuntu-latest'

    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Cache key combines OS, Python version and a hash of both requirements
      # files, so a change to either file produces a fresh cache entry.
      - name: Cache Dependencies
        uses: actions/cache@v4
        with:
          path: ${{ env.pythonLocation }}
          key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}

      - name: Install Dependencies
        run: |
          pip install --upgrade -r requirements-dev.txt
          pip install -e .

      # --output-format=github asks ruff for GitHub-formatted output.
      - name: Ruff
        run: 'ruff check --output-format=github pyterrier_t5'
55 changes: 55 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Test workflow: runs the pytest suite on each matrix entry and appends the
# test summary and per-file coverage to the job's step summary.
name: test

on:
  # pushes to main
  push:
    branches: [main]
  # all PRs
  pull_request: {}
  # every Wednesday at noon
  schedule:
    - cron: '0 12 * * 3'

jobs:
  pytest:
    strategy:
      matrix:
        os: ['ubuntu-latest']
        # Version strings stay quoted so they are never parsed as floats.
        python-version: ['3.9', '3.12']

    runs-on: ${{ matrix.os }}
    env:
      # Unique tag per matrix entry; used in the cache key and report filenames.
      runtag: ${{ matrix.os }}-${{ matrix.python-version }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Cache Dependencies
        uses: actions/cache@v4
        with:
          path: ${{ env.pythonLocation }}
          key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}

      # Same actions/cache major version as the dependency cache above.
      # NOTE(review): the key is static, so an existing cache entry is reused
      # as-is and is not refreshed when new files appear under ~/.cache.
      - name: Loading Torch models from cache
        uses: actions/cache@v4
        with:
          path: /home/runner/.cache/
          key: model-cache

      - name: Install Dependencies
        run: |
          pip install --upgrade -r requirements.txt -r requirements-dev.txt
          pip install -e .

      # Writes <runtag>.results.json (pytest-json-report) and
      # <runtag>.coverage.json (pytest-cov) for the reporting step below.
      - name: Unit Test
        run: |
          pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_t5 --cov-report json:${{ env.runtag }}.coverage.json tests/

      # Runs even when the tests fail so the summary is always published.
      - name: Report Test Results
        if: always()
        run: |
          printf "**Test Results**\n\n" >> "$GITHUB_STEP_SUMMARY"
          jq '.summary' ${{ env.runtag }}.results.json >> "$GITHUB_STEP_SUMMARY"
          printf "\n\n**Test Coverage**\n\n" >> "$GITHUB_STEP_SUMMARY"
          jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> "$GITHUB_STEP_SUMMARY"
118 changes: 117 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,118 @@
# Ignore rules for this repository, grouped by the tool that produces the files.
# gitignore has no inline comments: a '#' only starts a comment at line start.

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
# (this group also carries the general-purpose logs/ and .idea entries)
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
logs/
wheels/
*.egg-info/
.idea
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
# NOTE(review): *.res is presumably for retrieval result files written during
# tests - confirm before relying on it.
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
*.res

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# mac
.DS_Store

# Java / IDE project files
# (logs/ is already ignored in the packaging group earlier in this file, so it
# is not repeated here.)
terrier-python-helper/target/
.vscode
.classpath
.project
terrier-python-helper/.settings
# NOTE(review): matches any name ending in "jar", not only "*.jar" - confirm
# this breadth is intended.
*jar
34 changes: 34 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Packaging metadata for pyterrier-t5, built with the setuptools backend.
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "pyterrier-t5"
description = "PyTerrier components for T5 ranking"
readme = "README.md"
requires-python = ">=3.9"
# Both fields are filled in at build time from [tool.setuptools.dynamic].
dynamic = ["version", "dependencies"]
authors = [
    { name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk" },
]
maintainers = [
    { name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk" },
]
classifiers = [
    "Programming Language :: Python",
    "Operating System :: OS Independent",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: Indexing",
    "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
]

[project.urls]
Repository = "https://github.com/terrierteam/pyterrier_t5"
"Bug Tracker" = "https://github.com/terrierteam/pyterrier_t5/issues"

[tool.setuptools.dynamic]
# Version is read from the package's __version__ attribute; runtime
# dependencies are read from requirements.txt.
version = { attr = "pyterrier_t5.__version__" }
dependencies = { file = ["requirements.txt"] }

[tool.setuptools.packages.find]
# Keep the test suite out of the built distribution.
exclude = ["tests"]
15 changes: 6 additions & 9 deletions pyterrier_t5/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import sys
__version__ = '0.1.0'

import math
import warnings
import itertools
import pyterrier as pt
import pandas as pd
from collections import defaultdict
from pyterrier.model import add_ranks
import torch
from torch.nn import functional as F
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration
from typing import List
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, MT5ForConditionalGeneration


class MonoT5ReRanker(pt.Transformer):
@@ -39,7 +37,7 @@ def transform(self, run):
scores = []
queries, texts = run['query'], run[self.text_field]
it = range(0, len(queries), self.batch_size)
prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
if self.verbose:
it = pt.tqdm(it, desc='monoT5', unit='batches')
@@ -91,9 +89,8 @@ def __str__(self):
return f"DuoT5({self.model_name})"

def transform(self, run):
queries, texts = run['query'], run[self.text_field]
scores = defaultdict(lambda: 0.)
prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
max_vlen = self.model.config.n_positions - prompts['input_ids'].shape[1]
for batch in self._iter_duo_batches(run):
enc_query = self.tokenizer.batch_encode_plus([f'Query: {q}' for q in batch['query']], return_tensors='pt', padding='longest')
@@ -192,7 +189,7 @@ def transform(self, run):
scores = []
queries, texts = run['query'], run[self.text_field]
it = range(0, len(queries), self.batch_size)
prompts = self.tokenizer.batch_encode_plus([f'Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
prompts = self.tokenizer.batch_encode_plus(['Relevant:' for _ in range(self.batch_size)], return_tensors='pt', padding='longest')
max_vlen = 512 - prompts['input_ids'].shape[1] #mT5Config doesn't have n_positions so we fallback to 512
if self.verbose:
it = pt.tqdm(it, desc='monoT5', unit='batches')
5 changes: 5 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Development-only requirements, installed by the style and test workflows.
# pytest-cov and pytest-json-report back the --cov / --json-report flags in the
# test workflow's pytest command; ruff is the linter run by the style workflow.
pytest
pytest-subtests
pytest-cov
pytest-json-report
ruff
Loading