refactoring & scorer (#4)
Co-authored-by: Alberto Veneri <[email protected]>
Co-authored-by: Craig Macdonald <[email protected]>
3 people authored Aug 28, 2024
1 parent 40e5d02 commit 4f89605
Showing 4 changed files with 242 additions and 93 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/push.yml
@@ -0,0 +1,54 @@
name: Python package

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10']
        java: [11]
        os: ['ubuntu-latest']
        architecture: ['x64']
        terrier: ['snapshot']

    runs-on: ${{ matrix.os }}
    steps:
    - uses: actions/checkout@v2

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}

    - name: Setup java
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
        architecture: ${{ matrix.architecture }}

    - name: Install Terrier snapshot
      if: matrix.terrier == '5.4-SNAPSHOT'
      run: |
        git clone https://github.com/terrier-org/terrier-core.git
        cd terrier-core
        mvn -B -DskipTests install
    - name: Install Python dependencies
      run: |
        python -m pip install --upgrade pip
        pip install git+https://github.com/naver/splade.git
        pip install -r requirements.txt
        pip install --timeout=120 .
        pip install pytest
    - name: All unit tests
      env:
        TERRIER_VERSION: ${{ matrix.terrier }}
      run: |
        pytest
21 changes: 16 additions & 5 deletions README.md
@@ -21,13 +21,12 @@ The Terrier indexer is configured to index tokens unchanged.
```python

import pyterrier as pt
pt.init()

import pyt_splade
splade = pyt_splade.SpladeFactory()
splade = pyt_splade.Splade()
indexer = pt.IterDictIndexer('./msmarco_psg', pretokenised=True)

indxr_pipe = splade.indexing() >> indexer
indxr_pipe = splade.doc_encoder() >> indexer
index_ref = indxr_pipe.index(dataset.get_corpus_iter(), batch_size=128)

```
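
The `doc_encoder()` transformer adds a `toks` column to each document: a dictionary mapping vocabulary tokens to integer impact weights (zero-weight tokens are dropped), which the pretokenised indexer stores directly. A minimal sketch of inspecting that output without indexing (the documents and printed weights below are illustrative):

```python

import pandas as pd
import pyt_splade

splade = pyt_splade.Splade()

# each row gains a 'toks' column: {token: integer weight}
docs = pd.DataFrame([
    {'docno': 'd1', 'text': 'hello there'},
    {'docno': 'd2', 'text': 'chemistry society'},
])
encoded = splade.doc_encoder()(docs)
print(encoded['toks'].iloc[0])  # e.g. {'hello': 138, 'greeting': 61, ...} (illustrative values)

```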
@@ -39,7 +38,18 @@ We apply this as a query encoding transformer. It encodes the query into Terrier

```python

splade_retr = splade.query() >> pt.BatchRetrieve('./msmarco_psg', wmodel='Tf')
splade_retr = splade.query_encoder(matchop=True) >> pt.terrier.Retrieve('./msmarco_psg', wmodel='Tf')

```
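
With `matchop=True`, the encoded query tokens are rewritten into Terrier's matchop query language: each token becomes `#combine:0=weight(token)`, and tokens containing non-alphanumeric characters are wrapped in `#base64(...)` so the query parser handles them safely. Without it, `query_encoder()` instead leaves a `query_toks` column of token weights. A minimal sketch (the query and printed weights are illustrative):

```python

import pandas as pd
import pyt_splade

splade = pyt_splade.Splade()
queries = pd.DataFrame([{'qid': 'q1', 'query': 'chemical reactions'}])

# sparse encoding only: adds a 'query_toks' column of {token: weight}
print(splade.query_encoder()(queries)['query_toks'].iloc[0])

# matchop rewriting: the 'query' column becomes Terrier matchop syntax,
# e.g. '#combine:0=151.8(chemical) #combine:0=97.2(reactions) ...' (illustrative)
print(splade.query_encoder(matchop=True)(queries)['query'].iloc[0])

```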

# Scoring

SPLADE can also be used as a text scoring function.

```python

first_stage = ... # e.g., BM25, dense retrieval, etc.
splade_scorer = first_stage >> pt.text.get_text(dataset, 'text') >> splade.scorer()

```
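
The scorer computes the dot product between the SPLADE query and document representations, adding `score` and `rank` columns to the input frame. A minimal sketch, applying it directly to a small frame of query-document pairs (mirroring the unit test added in this commit):

```python

import pandas as pd
import pyt_splade

splade = pyt_splade.Splade(device='cpu')

res = splade.scorer()(pd.DataFrame([
    {'qid': '0', 'query': 'chemical reactions', 'docno': 'd1', 'text': 'hello there'},
    {'qid': '0', 'query': 'chemical reactions', 'docno': 'd2', 'text': 'chemistry society'},
]))
print(res[['qid', 'docno', 'score', 'rank']])

```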

@@ -68,4 +78,5 @@ We have a demo of PyTerrier_SPLADE at https://huggingface.co/spaces/terrierteam/

# Credits

Craig Macdonald
- Craig Macdonald
- Sean MacAvaney
230 changes: 142 additions & 88 deletions pyt_splade/__init__.py
@@ -1,30 +1,24 @@
import base64
import string
import more_itertools
import pyterrier as pt

assert pt.started()
from typing import Union
import torch
import numpy as np
import pandas as pd

def _matchop(t, w):
import base64
import string
if not all(a in string.ascii_letters + string.digits for a in t):
encoded = base64.b64encode(t.encode('utf-8')).decode("utf-8")
t = f'#base64({encoded})'
if w != 1:
t = f'#combine:0={w}({t})'
return t

class SpladeFactory():
class Splade():

def __init__(
self,
model : Union[torch.nn.Module, str] = "naver/splade-cocondenser-ensembledistil",
tokenizer=None,
agg='max',
max_length = 256,
device=None):

import torch
self,
model: Union[torch.nn.Module, str] = "naver/splade-cocondenser-ensembledistil",
tokenizer=None,
agg='max',
max_length=256,
device=None):
self.max_length = max_length
self.model = model
self.tokenizer = tokenizer
@@ -46,86 +40,146 @@ def __init__(

self.reverse_voc = {v: k for k, v in self.tokenizer.vocab.items()}

def indexing(self) -> pt.Transformer:
def _transform_indexing(df):
rtr = []
if len(df) > 0:
with torch.no_grad():
# now compute the document representation
doc_reps = self.model(d_kwargs=self.tokenizer(
df.text.tolist(),
add_special_tokens=True,
padding="longest", # pad to max sequence length in batch
truncation="longest_first", # truncates to max model length,
max_length=self.max_length,
return_attention_mask=True,
return_tensors="pt",
).to(self.device))["d_rep"] # (sparse) doc rep in voc space, shape (docs, 30522,)

for i in range(doc_reps.shape[0]): #for each doc
# get the number of non-zero dimensions in the rep:
col = torch.nonzero(doc_reps[i]).squeeze().cpu().tolist()

# now let's create the bow representation as a dictionary
weights = doc_reps[i,col].cpu().tolist()
d = {self.reverse_voc[k] : v for k, v in sorted(zip(col, weights), key=lambda x: (-x[1], x[0]))}
rtr.append(d)
return df.assign(toks=rtr)
return pt.apply.generic(_transform_indexing)


def query(self, mult=100) -> pt.Transformer:

def _transform_query(df):
from pyterrier.model import push_queries
new_queries = []
if len(df) > 0:
with torch.no_grad():
# now compute the query representations
query_reps = self.model(q_kwargs=self.tokenizer(
df['query'].tolist(),
add_special_tokens=True,
padding="longest", # pad to max sequence length in batch
truncation="longest_first", # truncates to max model length,
max_length=self.max_length,
return_attention_mask=True,
return_tensors="pt",
).to(self.device))["q_rep"] # (sparse) q rep in voc space, shape (queries, 30522,)

for i in range(query_reps.shape[0]): #for each query
# get the number of non-zero dimensions in the rep:
cols = torch.nonzero(query_reps[i]).squeeze().cpu().tolist()
# and corresponding weights
weights = query_reps[i,cols].cpu().tolist()

# Now let's create the bow representation in terrier's matchop QL.
# We scale by mult(=100) to better match the quantized weights in
# the inverted index created by toks2doc(). These defaults match the
# parameters suggested for Anserini in Splade repo, namely
# quantization_factor_document=100 quantization_factor_query=100.
newquery = ' '.join( _matchop(self.reverse_voc[k], v * mult) for k, v in sorted(zip(cols, weights), key=lambda x: (-x[1], x[0])))
new_queries.append(newquery)

rtr = push_queries(df)
rtr['query'] = new_queries
return rtr
return pt.apply.generic(_transform_query)

def doc_encoder(self, text_field='text', batch_size=100, sparse=True, verbose=False, scale=100) -> pt.Transformer:
out_field = 'toks' if sparse else 'doc_vec'
return SpladeEncoder(self, text_field, out_field, 'd', sparse, batch_size, verbose, scale)

indexing = doc_encoder # backward compatible name

def query_encoder(self, batch_size=100, sparse=True, verbose=False, matchop=False, scale=100) -> pt.Transformer:
out_field = 'query_toks' if sparse else 'query_vec'
res = SpladeEncoder(self, 'query', out_field, 'q', sparse, batch_size, verbose, scale)
if matchop:
res = res >> MatchOp()
return res

def query(self, batch_size=100, sparse=True, verbose=False, matchop=True, scale=100) -> pt.Transformer:
# backward compatible name w/ default matchop=True
return self.query_encoder(batch_size, sparse, verbose, matchop, scale)

def scorer(self, text_field='text', batch_size=100, verbose=False) -> pt.Transformer:
return SpladeScorer(self, text_field, batch_size, verbose)

def encode(self, texts, rep='d', format='dict', scale=1.):
rtr = []
with torch.no_grad():
reps = self.model(**{rep + '_kwargs': self.tokenizer(
texts,
add_special_tokens=True,
padding="longest", # pad to max sequence length in batch
truncation="longest_first", # truncates to max model length,
max_length=self.max_length,
return_attention_mask=True,
return_tensors="pt",
).to(self.device)})[rep + '_rep']
reps = reps * scale
if format == 'dict':
reps = reps.cpu()
for i in range(reps.shape[0]):
# get the number of non-zero dimensions in the rep:
col = torch.nonzero(reps[i]).squeeze(1).tolist()
# now let's create the bow representation as a dictionary
weights = reps[i, col].cpu().tolist()
# if document cast to int to make the weights ready for terrier indexing
if rep == "d":
weights = list(map(int, weights))
sorted_weights = sorted(zip(col, weights), key=lambda x: (-x[1], x[0]))
# create the dict removing the weights less than 1, i.e. 0, that are not helpful
d = {self.reverse_voc[k]: v for k, v in sorted_weights if v > 0}
rtr.append(d)
elif format == 'np':
reps = reps.cpu().numpy()
for i in range(reps.shape[0]):
rtr.append(reps[i])
elif format == 'torch':
rtr = reps
return rtr


SpladeFactory = Splade # backward compatible name


class SpladeEncoder(pt.Transformer):
def __init__(self, splade, text_field, out_field, rep, sparse=True, batch_size=100, verbose=False, scale=1.):
self.splade = splade
self.text_field = text_field
self.out_field = out_field
self.rep = rep
self.sparse = sparse
self.batch_size = batch_size
self.verbose = verbose
self.scale = scale

def transform(self, df):
assert self.text_field in df.columns
it = iter(df[self.text_field])
if self.verbose:
it = pt.tqdm(it, total=len(df), unit=self.text_field)
res = []
for batch in more_itertools.chunked(it, self.batch_size):
res.extend(self.splade.encode(batch, self.rep, format='dict' if self.sparse else 'np', scale=self.scale))
return df.assign(**{self.out_field: res})


class SpladeScorer(pt.Transformer):
def __init__(self, splade, text_field, batch_size=100, verbose=False):
self.splade = splade
self.text_field = text_field
self.batch_size = batch_size
self.verbose = verbose

def transform(self, df):
assert all(f in df.columns for f in ['query', self.text_field])
it = df.groupby('query')
if self.verbose:
it = pt.tqdm(it, unit='query')
res = []
for query, df in it:
query_enc = self.splade.encode([query], 'q', 'torch')
scores = []
for batch in more_itertools.chunked(df[self.text_field], self.batch_size):
doc_enc = self.splade.encode(batch, 'd', 'torch')
scores.append((query_enc @ doc_enc.T).flatten().cpu().numpy())
res.append(df.assign(score=np.concatenate(scores)))
res = pd.concat(res)
from pyterrier.model import add_ranks
res = add_ranks(res)
return res


class MatchOp(pt.Transformer):

def transform(self, df):
assert 'query_toks' in df.columns
from pyterrier.model import push_queries
rtr = push_queries(df)
rtr = rtr.assign(
query=df.query_toks.apply(lambda toks: ' '.join(_matchop(k, v) for k, v in toks.items())))
rtr = rtr.drop(columns=['query_toks'])
return rtr


def _matchop(t, w):
if not all(a in string.ascii_letters + string.digits for a in t):
encoded = base64.b64encode(t.encode('utf-8')).decode("utf-8")
t = f'#base64({encoded})'
if w != 1:
t = f'#combine:0={w}({t})'
return t


def toks2doc(mult=100):

def _dict_tf2text(tfdict):
rtr = ""
for t in tfdict:
for i in range(int(mult*tfdict[t])):
rtr += t + " "
for i in range(int(mult * tfdict[t])):
rtr += t + " "
return rtr

def _rowtransform(df):
df = df.copy()
df["text"] = df['toks'].apply(_dict_tf2text)
df.drop(columns=['toks'], inplace=True)
return df

return pt.apply.generic(_rowtransform)

30 changes: 30 additions & 0 deletions tests/test_scorer.py
@@ -0,0 +1,30 @@
import unittest
import pandas as pd
import tempfile
class TestBasic(unittest.TestCase):

    def setUp(self):
        import pyterrier as pt
        if not pt.started():
            pt.init()
        import pyt_splade
        self.factory = pyt_splade.SpladeFactory(device='cpu')

    def test_scorer(self):
        df = self.factory.scorer()(pd.DataFrame([
            {'qid': '0', 'query': 'chemical reactions', 'docno' : 'd1', 'text' : 'hello there'},
            {'qid': '0', 'query': 'chemical reactions', 'docno' : 'd2', 'text' : 'chemistry society'},
            {'qid': '1', 'query': 'hello', 'docno' : 'd1', 'text' : 'hello there'},
        ]))
        self.assertAlmostEqual(0., df['score'][0])
        self.assertAlmostEqual(11.133593, df['score'][1], places=5)
        self.assertAlmostEqual(17.566324, df['score'][2], places=5)
        self.assertEqual('0', df['qid'][0])
        self.assertEqual('0', df['qid'][1])
        self.assertEqual('1', df['qid'][2])
        self.assertEqual('d1', df['docno'][0])
        self.assertEqual('d2', df['docno'][1])
        self.assertEqual('d1', df['docno'][2])
        self.assertEqual(1, df['rank'][0])
        self.assertEqual(0, df['rank'][1])
        self.assertEqual(0, df['rank'][2])
