From 517b811f6af2ad3b095fc960207851a7b710b2eb Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 22 Jan 2024 18:36:40 +0800
Subject: [PATCH] tmp: trace

---
 sh_scrapy/middlewares.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/sh_scrapy/middlewares.py b/sh_scrapy/middlewares.py
index 848ad77..6318db4 100644
--- a/sh_scrapy/middlewares.py
+++ b/sh_scrapy/middlewares.py
@@ -15,11 +15,11 @@
 
 class HubstorageSpiderMiddleware(object):
     """Hubstorage spider middleware.
-    
+
     What it does:
-    
+
     - Sets parent request ids to the requests coming out of the spider.
-    
+
     """
 
     def __init__(self):
@@ -27,23 +27,26 @@ def __init__(self):
 
     def process_spider_output(self, response, result, spider):
         parent = self._seen_requests.pop(response.request, None)
+        print(f"[SpiderMw] parent={parent}")
         for x in result:
+            print(f"[SpiderMw] result={x}")
             if isinstance(x, Request):
                 x.meta[HS_PARENT_ID_KEY] = parent
                 # Remove request id if it was for some reason set in the request coming from Spider.
                 x.meta.pop(HS_REQUEST_ID_KEY, None)
+                print(f"[SpiderMw] x.meta={x.meta}")
             yield x
 
 
 class HubstorageDownloaderMiddleware(object):
     """Hubstorage dowloader middleware.
-    
+
     What it does:
-    
+
     - Generates request ids for all downloaded requests.
     - Sets parent request ids for requests generated in downloader middlewares.
     - Stores all downloaded requests into Hubstorage.
-    
+
     """
 
     def __init__(self):
@@ -60,9 +63,12 @@ def process_request(self, request, spider):
         request.meta[HS_PARENT_ID_KEY] = request_id
 
     def process_response(self, request, response, spider):
+        print(f"[DownloaderMw] request={request} parent={request.meta.setdefault(HS_PARENT_ID_KEY)}")
+
         # This class of response check is intended to fix the bug described here
         # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112
         if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"):
+            print(f"[DownloaderMw] skip")
            return response
 
        self.pipe_writer.write_request(
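
The DummyResponse guard that the new trace sits next to matches on the class name and
defining module rather than importing DummyResponse and using isinstance(), presumably so
that scrapy_poet stays an optional dependency. A minimal standalone sketch of that check,
outside the middleware (the helper name is illustrative, not part of sh_scrapy):

    def is_scrapy_poet_dummy_response(response):
        # Same duck-typed check as in process_response(): compare the class name and
        # the module that defines it, so scrapy_poet never has to be imported here.
        cls = type(response)
        return cls.__name__ == "DummyResponse" and cls.__module__.startswith("scrapy_poet")

    # Ordinary responses (or any other object) fall through to write_request():
    print(is_scrapy_poet_dummy_response(object()))  # False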