From acb4a8ab9c3117055b162fbef888cd47ea43bbd5 Mon Sep 17 00:00:00 2001 From: Jabb0 <33359018+Jabb0@users.noreply.github.com> Date: Sat, 28 Nov 2020 19:51:09 +0100 Subject: [PATCH] Made the download aware of the actual returned batch size --- psaw/PushshiftAPI.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/psaw/PushshiftAPI.py b/psaw/PushshiftAPI.py index c12d3d6..03efd52 100644 --- a/psaw/PushshiftAPI.py +++ b/psaw/PushshiftAPI.py @@ -212,9 +212,15 @@ def _handle_paging(self, url): raise NotImplementedError(err_msg.format(self.max_results_per_request)) self._add_nec_args(self.payload) - yield self._get(url, self.payload) - - if (limit is not None) & (limit == 0): + data = self._get(url, self.payload) + yield data + received_size = int(data['metadata']['size']) + requested_size = self.payload['limit'] + # The API may return fewer results than requested; when it does, add the shortfall back to the remaining limit so that another request is sent. + if received_size < requested_size: + limit += requested_size - received_size + + if (limit is not None) and (limit == 0): return def _search(self,