
Commit ee40071

add support for 'start' and 'count' params
1 parent 4aa0112 commit ee40071

2 files changed: +58 -4 lines changed

scrapinghub/client/items.py

Lines changed: 31 additions & 1 deletion
@@ -1,5 +1,7 @@
 from __future__ import absolute_import
 
+import sys
+
 from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin
 
 
@@ -51,6 +53,20 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
           File "<stdin>", line 1, in <module>
         StopIteration
 
+    - retrieving via :meth:`list_iter` also supports the `start` and `count`
+      params. This is useful when you want to retrieve only a subset of the
+      items in a job. The example below assumes a job with 10 items::
+
+        >>> gen = job.items.list_iter(chunksize=2, start=5, count=3)
+        >>> next(gen)
+        [{'name': 'Item #5'}, {'name': 'Item #6'}]
+        >>> next(gen)
+        [{'name': 'Item #7'}]
+        >>> next(gen)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        StopIteration
+
     - retrieve 1 item with multiple filters::
 
         >>> filters = [("size", ">", [30000]), ("size", "<", [40000])]
@@ -85,18 +101,32 @@ def list_iter(self, chunksize=1000, *args, **kwargs):
         You can improve I/O overheads by increasing the chunk value but that
         would also increase the memory consumption.
 
+        :param chunksize: size of each list yielded per iteration
+        :param start: offset at which to start the item iteration
+        :param count: overall number of items to be returned, split into
+            lists of at most `chunksize` items
+
         :return: an iterator over items, yielding lists of items.
         :rtype: :class:`collections.Iterable`
         """
 
+        start = kwargs.pop("start", 0)
+        count = kwargs.pop("count", sys.maxsize)
         processed = 0
+
         while True:
-            next_key = self.key + '/' + str(processed)
+            next_key = self.key + "/" + str(start)
+            # Shrink the last fetch so no more than `count` items are returned.
+            if processed + chunksize > count:
+                chunksize = count - processed
             items = [
                 item for item in self.iter(
                     count=chunksize, start=next_key, *args, **kwargs)
             ]
             yield items
             processed += len(items)
+            start += len(items)
+            if processed >= count:
+                break
             if len(items) < chunksize:
                 break
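
Note: the paging logic added above can be reproduced over a plain in-memory
list, which makes the `start`/`count` semantics easy to check without a
Scrapy Cloud connection. This is a minimal sketch, not part of the commit;
the function name and sample data are illustrative only::

    import sys

    def list_iter_sketch(all_items, chunksize=1000, start=0, count=sys.maxsize):
        """Yield lists of at most `chunksize` items, mirroring Items.list_iter."""
        processed = 0
        while True:
            # Shrink the final fetch so no more than `count` items are returned,
            # matching the guard in the loop above.
            if processed + chunksize > count:
                chunksize = count - processed
            # Stand-in for self.iter(count=chunksize, start=next_key, ...).
            items = all_items[start:start + chunksize]
            yield items
            processed += len(items)
            start += len(items)
            if processed >= count:
                break
            if len(items) < chunksize:
                break

    data = [{'name': 'Item #%d' % i} for i in range(10)]
    for chunk in list_iter_sketch(data, chunksize=2, start=5, count=3):
        print(chunk)  # [{'name': 'Item #5'}, {'name': 'Item #6'}], then [{'name': 'Item #7'}]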

tests/client/test_items.py

Lines changed: 27 additions & 3 deletions
@@ -2,8 +2,8 @@
 from six.moves import range
 
 
-def _add_test_items(job):
-    for i in range(3):
+def _add_test_items(job, size=3):
+    for i in range(size):
         job.items.write({'id': i, 'data': 'data' + str(i)})
     job.items.flush()
     job.items.close()
@@ -41,8 +41,9 @@ def test_items_list(spider, json_and_msgpack):
 def test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
     _add_test_items(job)
+    job.finish()
 
-    o = job.items.list_iter(2)
+    o = job.items.list_iter(chunksize=2)
     assert next(o) == [
         {'id': 0, 'data': 'data0'},
         {'id': 1, 'data': 'data1'},
@@ -52,3 +53,26 @@ def test_items_list_iter(spider, json_and_msgpack):
     ]
     with pytest.raises(StopIteration):
         next(o)
+
+
+def test_items_list_iter_with_start_and_count(spider, json_and_msgpack):
+    job = spider.jobs.run(meta={'state': 'running'})
+    _add_test_items(job, size=10)
+    job.finish()
+
+    o = job.items.list_iter(chunksize=3, start=3, count=7)
+    assert next(o) == [
+        {'id': 3, 'data': 'data3'},
+        {'id': 4, 'data': 'data4'},
+        {'id': 5, 'data': 'data5'},
+    ]
+    assert next(o) == [
+        {'id': 6, 'data': 'data6'},
+        {'id': 7, 'data': 'data7'},
+        {'id': 8, 'data': 'data8'},
+    ]
+    assert next(o) == [
+        {'id': 9, 'data': 'data9'},
+    ]
+    with pytest.raises(StopIteration):
+        next(o)
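
A usage sketch for the new parameters (not part of the commit; the API key and
job key below are placeholders)::

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')   # placeholder API key
    job = client.get_job('123/1/56')       # placeholder job key

    # Skip the first 5 items and read at most 3 more, in lists of up to 2.
    for chunk in job.items.list_iter(chunksize=2, start=5, count=3):
        for item in chunk:
            print(item)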
