
Commit ee40071

add support for 'start' and 'count' params
1 parent 4aa0112 commit ee40071

2 files changed: +58 -4 lines changed

scrapinghub/client/items.py

Lines changed: 31 additions & 1 deletion
@@ -1,5 +1,7 @@
 from __future__ import absolute_import
 
+import sys
+
 from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin
 
 
@@ -51,6 +53,20 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
           File "<stdin>", line 1, in <module>
         StopIteration
 
+    - retrieving via :meth:`list_iter` also supports the `start` and `count`
+      params. This is useful when you want to retrieve only a subset of the
+      items in a job. The example below assumes a job with 10 items::
+
+        >>> gen = job.items.list_iter(chunksize=2, start=5, count=3)
+        >>> next(gen)
+        [{'name': 'Item #5'}, {'name': 'Item #6'}]
+        >>> next(gen)
+        [{'name': 'Item #7'}]
+        >>> next(gen)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        StopIteration
+
     - retrieve 1 item with multiple filters::
 
         >>> filters = [("size", ">", [30000]), ("size", "<", [40000])]
@@ -85,18 +101,32 @@ def list_iter(self, chunksize=1000, *args, **kwargs):
         You can improve I/O overheads by increasing the chunk value but that
         would also increase the memory consumption.
 
+        :param chunksize: size of each list yielded per iteration
+        :param start: offset at which to start the item iteration
+        :param count: overall number of items to be returned, split into
+            lists of at most `chunksize` items
+
         :return: an iterator over items, yielding lists of items.
         :rtype: :class:`collections.Iterable`
         """
 
+        start = kwargs.pop("start", 0)
+        count = kwargs.pop("count", sys.maxsize)
         processed = 0
+
         while True:
-            next_key = self.key + '/' + str(processed)
+            next_key = self.key + "/" + str(start)
+            # Shrink the last fetch so no more than `count` items are returned.
+            if processed + chunksize > count:
+                chunksize = count - processed
             items = [
                 item for item in self.iter(
                     count=chunksize, start=next_key, *args, **kwargs)
             ]
             yield items
             processed += len(items)
+            start += len(items)
+            if processed >= count:
+                break
             if len(items) < chunksize:
                 break
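
Note: the paging logic added above can be reproduced over a plain in-memory
list, which makes the `start`/`count` semantics easy to check without a
Scrapy Cloud connection. This is a minimal sketch, not part of the commit;
the function name and sample data are illustrative only::

    import sys

    def list_iter_sketch(all_items, chunksize=1000, start=0, count=sys.maxsize):
        """Yield lists of at most `chunksize` items, mirroring Items.list_iter."""
        processed = 0
        while True:
            # Shrink the final fetch so no more than `count` items are returned,
            # matching the guard in the loop above.
            if processed + chunksize > count:
                chunksize = count - processed
            # Stand-in for self.iter(count=chunksize, start=next_key, ...).
            items = all_items[start:start + chunksize]
            yield items
            processed += len(items)
            start += len(items)
            if processed >= count:
                break
            if len(items) < chunksize:
                break

    data = [{'name': 'Item #%d' % i} for i in range(10)]
    for chunk in list_iter_sketch(data, chunksize=2, start=5, count=3):
        print(chunk)  # [{'name': 'Item #5'}, {'name': 'Item #6'}], then [{'name': 'Item #7'}]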

tests/client/test_items.py

Lines changed: 27 additions & 3 deletions
@@ -2,8 +2,8 @@
 from six.moves import range
 
 
-def _add_test_items(job):
-    for i in range(3):
+def _add_test_items(job, size=3):
+    for i in range(size):
         job.items.write({'id': i, 'data': 'data' + str(i)})
     job.items.flush()
     job.items.close()
@@ -41,8 +41,9 @@ def test_items_list(spider, json_and_msgpack):
 def test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
     _add_test_items(job)
+    job.finish()
 
-    o = job.items.list_iter(2)
+    o = job.items.list_iter(chunksize=2)
     assert next(o) == [
         {'id': 0, 'data': 'data0'},
         {'id': 1, 'data': 'data1'},
@@ -52,3 +53,26 @@ def test_items_list_iter(spider, json_and_msgpack):
     ]
     with pytest.raises(StopIteration):
         next(o)
+
+
+def test_items_list_iter_with_start_and_count(spider, json_and_msgpack):
+    job = spider.jobs.run(meta={'state': 'running'})
+    _add_test_items(job, size=10)
+    job.finish()
+
+    o = job.items.list_iter(chunksize=3, start=3, count=7)
+    assert next(o) == [
+        {'id': 3, 'data': 'data3'},
+        {'id': 4, 'data': 'data4'},
+        {'id': 5, 'data': 'data5'},
+    ]
+    assert next(o) == [
+        {'id': 6, 'data': 'data6'},
+        {'id': 7, 'data': 'data7'},
+        {'id': 8, 'data': 'data8'},
+    ]
+    assert next(o) == [
+        {'id': 9, 'data': 'data9'},
+    ]
+    with pytest.raises(StopIteration):
+        next(o)
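
A usage sketch for the new parameters (not part of the commit; the API key and
job key below are placeholders)::

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')   # placeholder API key
    job = client.get_job('123/1/56')       # placeholder job key

    # Skip the first 5 items and read at most 3 more, in lists of up to 2.
    for chunk in job.items.list_iter(chunksize=2, start=5, count=3):
        for item in chunk:
            print(item)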
