From 6c77bb77d19e75430bc2e85b15e69ae7ecbf9d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?= Date: Fri, 1 Oct 2021 15:34:13 -0300 Subject: [PATCH 1/4] Add example of how to run start-crawl in docker --- docs/conf.py | 5 ++- docs/deploy-custom-image.rst | 74 ++++++++++++++++++++++++++++++++++++ docs/requirements.txt | 2 + 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 docs/requirements.txt diff --git a/docs/conf.py b/docs/conf.py index 9a74f564..37db5603 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -145,6 +145,7 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_css_files = ["theme_overrides.css"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. @@ -307,5 +308,5 @@ # Wrap text in tables # https://github.com/snide/sphinx_rtd_theme/issues/117#issuecomment-41571653 -def setup(app): - app.add_stylesheet("theme_overrides.css") +# def setup(app): +# app.add_stylesheet() diff --git a/docs/deploy-custom-image.rst b/docs/deploy-custom-image.rst index 7bb9bb23..bc4b3e96 100644 --- a/docs/deploy-custom-image.rst +++ b/docs/deploy-custom-image.rst @@ -566,3 +566,77 @@ If you are using a private repository to push your images to, make sure to pass Or pass it to :ref:`upload ` command:: $ shub image upload --username johndoe --password yourpass + + +Container works locally but fails in scrapy cloud +------------------------------------------------- + +Prior to running ``start-crawl`` in Scrapy Cloud, some configurations +are set to ensure we can run an isolated process. +This can lead to issues that are quite hard to debug and find the +root cause. +To aid in this process, below you willl find some steps that +are quite similar to what actually runs in scrapy cloud. 
+ +Run your container in interactive with ``bash`` (or any other +terminal that is available). Please replace the 2 occurrences of +```` with the actual spider that is to run:: + + $ docker run \ + -it \ + -e SHUB_JOBKEY=123/4/5 \ + -e SHUB_JOB_DATA='{ + "_shub_worker": "kumo", + "api_url": "https://app.zyte.com/api/", + "auth": "", + "deploy_id": 1, + "key": "123/4/5", + "pending_time": 1632739881823, + "priority": 2, + "project": 123, + "running_time": 1632739882059, + "scheduled_by": "some_user", + "spider": "", + "spider_type": "manual", + "started_by": "jobrunner", + "state": "running", + "tags": [], + "units": 1, + "version": "1.0" + }' \ + -e SHUB_JOB_ENV='{}' \ + -e SHUB_JOB_MEMORY_LIMIT=950 \ + -e SHUB_JOB_UID=123 \ + -e SHUB_SETTINGS='{ + "deploy_id": 1, + "enabled_addons": [], + "job_settings": {}, + "organization_settings": {}, + "project_settings": {}, + "spider_settings": {}, + "status": "ok", + "version": "1.0" + }' \ + -e SHUB_SPIDER= \ + --net bridge \ + --volume=/scrapinghub \ + --rm=true \ + --name=scrapy-cloud-container \ + my-docker-image \ + /bin/bash + +Connect to the container in a new terminal window +and open a named pipe to communicate through ``sh_scrapy``:: + + $ docker exec -it scrapy-cloud-container /bin/bash + $ mkfifo -m 0600 /dev/scrapinghub + $ chown 65534:65534 /dev/scrapinghub + $ cat /dev/scrapinghub + +Go back to the first window and start the crawling process:: + + $ export SHUB_FIFO_PATH=/dev/scrapinghub + $ start-crawl + +Switch back to the second window (the named pipe one) +to see the results coming out. 
diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..04ba4d62 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +Sphinx>=3.0 +sphinx-rtd-theme>=0.5.2 \ No newline at end of file From 3c490f840f0fdb793a2ac5c25fcc66c693a8934a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?= Date: Mon, 4 Oct 2021 14:53:42 -0300 Subject: [PATCH 2/4] Update with PR comments --- docs/deploy-custom-image.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/deploy-custom-image.rst b/docs/deploy-custom-image.rst index bc4b3e96..4ddfab9f 100644 --- a/docs/deploy-custom-image.rst +++ b/docs/deploy-custom-image.rst @@ -568,17 +568,17 @@ Or pass it to :ref:`upload ` command:: $ shub image upload --username johndoe --password yourpass -Container works locally but fails in scrapy cloud +Container works locally but fails in Scrapy Cloud ------------------------------------------------- Prior to running ``start-crawl`` in Scrapy Cloud, some configurations are set to ensure we can run an isolated process. This can lead to issues that are quite hard to debug and find the root cause. -To aid in this process, below you willl find some steps that -are quite similar to what actually runs in scrapy cloud. +To aid in this process, below you will find some steps that +are quite similar to what actually runs in Scrapy Cloud. -Run your container in interactive with ``bash`` (or any other +Run your container in interactive mode with ``bash`` (or any other terminal that is available). Please replace the 2 occurrences of ```` with the actual spider that is to run:: @@ -588,7 +588,7 @@ terminal that is available). 
Please replace the 2 occurrences of -e SHUB_JOB_DATA='{ "_shub_worker": "kumo", "api_url": "https://app.zyte.com/api/", - "auth": "", + "auth": "SOME AUTH KEY NOT REQUIRED FOR THIS TEST", "deploy_id": 1, "key": "123/4/5", "pending_time": 1632739881823, From 0ce87c556c0048b137c133f51968c7c1ca2d0714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?= Date: Mon, 4 Oct 2021 14:55:11 -0300 Subject: [PATCH 3/4] Remove docs changes --- docs/conf.py | 5 ++--- docs/requirements.txt | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) delete mode 100644 docs/requirements.txt diff --git a/docs/conf.py b/docs/conf.py index 37db5603..ae142436 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -145,7 +145,6 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] -html_css_files = ["theme_overrides.css"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. 
@@ -308,5 +307,5 @@ # Wrap text in tables # https://github.com/snide/sphinx_rtd_theme/issues/117#issuecomment-41571653 -# def setup(app): -# app.add_stylesheet() +def setup(app): + app.add_stylesheet() diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 04ba4d62..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -Sphinx>=3.0 -sphinx-rtd-theme>=0.5.2 \ No newline at end of file From d38c4c05eb9b117e68502ce571a6046b9a192d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?= Date: Tue, 5 Oct 2021 08:26:36 -0300 Subject: [PATCH 4/4] Fix conf.py --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index ae142436..9a74f564 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -308,4 +308,4 @@ # Wrap text in tables # https://github.com/snide/sphinx_rtd_theme/issues/117#issuecomment-41571653 def setup(app): - app.add_stylesheet() + app.add_stylesheet("theme_overrides.css")