Skip to content

Commit fa39c09

Browse files
committed
lccs now retrieving the sequence
1 parent 713a899 commit fa39c09

File tree

5 files changed

+89
-21
lines changed

5 files changed

+89
-21
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ lccs_len = lccs_length(seq1, seq2) # 5, [2, 3, 4, 5, 6]
3535

3636
## TODOs
3737

38+
* implement lccs with a suffix tree;
3839
* batch methods, i.e. supporting 2D arrays;
3940
* batch methods with any number of dimensions (nD array) and add a `dim` argument;
4041
* make it work with an unlimited number of sequences, and set `dim` and `pad_token` as kwargs only;

src/cpu/lccs_cpu_dyn.cpp

+32-3
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ namespace nb = nanobind;
66
using namespace nb::literals;
77

88

9-
// Calculating the length of the longest common contiguous subsequence with dynamic programming
10-
int lccs_length(
9+
// Returns the length of the longest common contiguous subsequence and the index in s1 of its last element
10+
std::vector<int> lccs_length_idx(
1111
const nb::ndarray<double, nb::ndim<1>>& s1,
1212
const nb::ndarray<double, nb::ndim<1>>& s2
1313
) {
@@ -21,16 +21,45 @@ int lccs_length(
2121

2222
std::vector<std::vector<int>> table(s1Len + 1, std::vector<int>(s2Len + 1, 0));
2323
int max_length = 0;
24+
int imax = 0; // ending idx of the lccs
2425
for (int i = 0; i < s1Len; ++i) {
2526
for (int j = 0; j < s2Len; ++j) {
2627
if (v1(i) == v2(j)) {
2728
table[i + 1][j + 1] = table[i][j] + 1;
2829
if (table[i + 1][j + 1] > max_length) {
30+
imax = i;
2931
max_length = table[i + 1][j + 1];
3032
}
3133
}
3234
}
3335
}
36+
std::vector<int> lccs_len_idx = {max_length, imax};
37+
return lccs_len_idx;
38+
}
39+
40+
41+
// Calculating the length of the longest common contiguous subsequence with dynamic programming
42+
int lccs_length(
43+
const nb::ndarray<double, nb::ndim<1>>& s1,
44+
const nb::ndarray<double, nb::ndim<1>>& s2
45+
) {
46+
std::vector<int> lccs_len_idx = lccs_length_idx(s1, s2);
47+
return lccs_len_idx[0];
48+
}
49+
50+
51+
// Calculating the longest common contiguous subsequence with dynamic programming
52+
std::vector<int> lccs(
53+
const nb::ndarray<double, nb::ndim<1>>& s1,
54+
const nb::ndarray<double, nb::ndim<1>>& s2
55+
) {
56+
std::vector<int> lccs_len_idx = lccs_length_idx(s1, s2);
57+
58+
// Extract the longest common substring from s1
59+
std::vector<int> longestSubseq(lccs_len_idx[0]);
60+
int idx = 0;
61+
for (int i = lccs_len_idx[1] - lccs_len_idx[0] + 1; i <= lccs_len_idx[1]; ++i)
62+
longestSubseq[idx++] = s1(i);
3463

35-
return max_length;
64+
return longestSubseq;
3665
}

src/lcs.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ NB_MODULE(lcsvec_ext, m) {
1313
m.def("lcs_table", &createLCSTable, "seq1"_a, "seq2"_a,
1414
"Returns the longest common subsequence (lcs) table from `seq1` and `seq2`.");
1515

16+
m.def("lccs", &lccs, "seq1"_a, "seq2"_a,
17+
"Returns the longest common contiguous subsequence (lccs) from `seq1` and `seq2`.");
1618
m.def("lccs_length", &lccs_length, "seq1"_a, "seq2"_a,
1719
"Returns the length of the longest common contiguous subsequence (lccs) from `seq1` and `seq2`.");
1820
}

src/lcsvec/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Main LCSvec module."""
22

3-
from .lcsvec_ext import lccs_length, lcs, lcs_length, lcs_table
3+
from .lcsvec_ext import lccs, lccs_length, lcs, lcs_length, lcs_table
44

55
__version__ = "0.0.1"
66

7-
__all__ = ["lccs_length", "lcs", "lcs_length", "lcs_table"]
7+
__all__ = ["lccs", "lccs_length", "lcs", "lcs_length", "lcs_table"]

tests/test_lccs_cpu.py

+52-16
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,73 @@
55
from typing import TYPE_CHECKING
66

77
import numpy as np
8-
from lcsvec import lccs_length
8+
import pytest
9+
from lcsvec import lccs, lccs_length
910
from torch import IntTensor, LongTensor, arange
1011

1112
if TYPE_CHECKING:
1213
from numpy.typing import NDArray
1314

1415

16+
# Each case is a `(seq1, seq2, expected)` tuple: the parametrized tests unpack
# it in that order and compare `lccs(seq1, seq2)` against `expected`.
# `range` entries are expanded into arrays/tensors by the tests themselves.
TEST_CASES = [
    (range(12), [8, 0, 1, 2, 8, 2, 3, 8, 4, 0], range(3)),
    (range(12), [8, 0, 9, 2, 8, 2, 7, 3, 4, 5], range(3, 6)),
    (range(12), [0, 1, 2, 3, 8, 9, 2, 3, 4, 5], range(4)),
    (range(-2, 10), [9, -1, 0, 1, 2, 9, 2, 4, 4, 5], range(-1, 3)),
]
22+
23+
1524
def _test_lccs(
    seq1: NDArray | IntTensor | LongTensor,
    seq2: NDArray | IntTensor | LongTensor,
    ref: list[int],
) -> bool:
    r"""
    Check `lccs` and `lccs_length` against a reference sequence.

    :param seq1: first sequence, passed as-is to the extension functions.
    :param seq2: second sequence, passed as-is to the extension functions.
    :param ref: expected longest common contiguous subsequence.
    :return: ``True`` when both assertions hold, so that callers can wrap the
        call in their own ``assert``.
    :raises AssertionError: if the length or the sequence differs from ``ref``.
    """
    lccs_ = lccs(seq1, seq2)
    lccs_len = lccs_length(seq1, seq2)

    assert lccs_len == len(ref)
    assert lccs_ == ref
    return True
2335

24-
def test_lccs_numpy() -> None:
36+
37+
@pytest.mark.parametrize("sequences", TEST_CASES)
def test_lccs_numpy(
    sequences: tuple[list[int] | range, list[int] | range, list[int] | range],
) -> None:
    r"""Run the LCCS checks on numpy arrays built from one test case."""
    # Ranges become `np.arange(start, stop)`; lists become int64 arrays.
    seq1, seq2, expected = (
        np.arange(item.start, item.stop)
        if isinstance(item, range)
        else np.array(item, dtype=np.int64)
        for item in sequences
    )

    # The reference is compared as a plain Python list.
    assert _test_lccs(seq1, seq2, expected.tolist())
3260

3361

34-
def test_lccs_torch() -> None:
62+
@pytest.mark.parametrize("sequences", TEST_CASES)
def test_lccs_torch(
    sequences: tuple[list[int] | range, list[int] | range, list[int] | range],
) -> None:
    r"""Run the LCCS checks on pytorch tensors built from one test case."""
    # Ranges become `arange(start, stop)`; lists become `LongTensor`s.
    seq1, seq2, expected = (
        arange(item.start, item.stop) if isinstance(item, range) else LongTensor(item)
        for item in sequences
    )

    # The reference is compared as a plain Python list.
    assert _test_lccs(seq1, seq2, expected.tolist())

0 commit comments

Comments
 (0)