started work on 9gag crawler #119 ... and set it on hold

jkowalleck · May 21, 2020 · 025cbcb · 025cbcb
1 parent a10e7af
commit 025cbcb
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 8 deletions.
diff --git a/examples/config/dev_9gag.yaml b/examples/config/dev_9gag.yaml
@@ -0,0 +1,13 @@
+## This is a config file for nichtparasoup (v3.0)
+## This is an example file. For a better kickStart run: nichtparasoup server config --help
+
+webserver:
+  hostname: "0.0.0.0"
+  port: 8080
+
+imageserver:
+  crawler_upkeep: 10
+
+crawlers:
+  - name: "9gag"
+    # config:
diff --git a/src/nichtparasoup/imagecrawlers/__init__.py b/src/nichtparasoup/imagecrawlers/__init__.py
@@ -25,14 +25,16 @@ def _builtins() -> Dict[_ImagecrawlerName, _ImagecrawlerClass]:
         from .reddit import Reddit
         from .instagram import InstagramHashtag, InstagramProfile
         from .pr0gramm import Pr0gramm
-        return dict(
-            Echo=Echo,
-            Picsum=Picsum,
-            Reddit=Reddit,
-            InstagramProfile=InstagramProfile,
-            InstagramHashtag=InstagramHashtag,
-            Pr0gramm=Pr0gramm,
-        )
+        from .ninegag import NineGag
+        return {
+            'Echo': Echo,
+            'Picsum': Picsum,
+            'Reddit': Reddit,
+            'InstagramProfile': InstagramProfile,
+            'InstagramHashtag': InstagramHashtag,
+            'Pr0gramm': Pr0gramm,
+            '9gag': NineGag,
+        }
 
     def __init__(self, entries: Iterable[EntryPoint]) -> None:  # pragma: no cover
         self._list: List[_Imagecrawler] = [(n, c) for n, c in self._builtins().items()]

diff --git a/src/nichtparasoup/imagecrawlers/ninegag.py b/src/nichtparasoup/imagecrawlers/ninegag.py
@@ -0,0 +1,81 @@
+__all__ = ["NineGag"]
+
+from typing import Any, Dict, Optional
+from urllib.parse import urlencode, quote as urlquote
+
+from ..imagecrawler import BaseImageCrawler, Image, ImageCollection, ImageCrawlerConfig, ImageCrawlerInfo, RemoteFetcher
+
+# @TODO fix crawling
+# currently the "api" is not crawlable ... thanks to CloudFlare protection ...
+# curl -v -L \
+# -H 'Referer: https://9gag.com/' \
+# -H 'Accept: application/json' \
+# -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0' \
+# 'https://9gag.com/v1/group-posts/group/default/type/hot?c=10&after='
+#
+
+class NineGag(BaseImageCrawler):
+
+    def __init__(self, **config: Any) -> None:  # pragma: no cover
+        super().__init__(**config)
+        self._after: Optional[str] = None
+        self._remote_fetcher = RemoteFetcher(headers={
+            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'X-Requested-With': 'XMLHttpRequest',
+            'Referer': 'https://9gag.com/',
+            'Cookie': '__cfduid=dabf5c79209d50bb41e6d457b5e0a94b61590077498; ____ri=165; ____lo=DE; gag_tz=2; sign_up_referer=https%3A%2F%2Fduckduckgo.com%2F; _pk_id.7.f7ab=2e8a243e691eff4a.1590077498.2.1590082408.1590082407.; _pk_ref.7.f7ab=%5B%22%22%2C%22%22%2C1590082407%2C%22https%3A%2F%2Fduckduckgo.com%2F%22%5D; _pk_ses.7.f7ab=*',
+        })
+
+    @classmethod
+    def info(cls) -> ImageCrawlerInfo:  # pragma: no cover
+        return ImageCrawlerInfo(
+            # TODO
+            description='',
+            long_description='',
+            config={},
+            icon_url='https://upload.wikimedia.org/wikipedia/commons/9/97/9GAG_new_logo.svg',
+        )
+
+    @classmethod
+    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:  # pragma: no cover
+        return ImageCrawlerConfig(
+            # TODO
+        )
+
+    def _reset(self) -> None:  # pragma: no cover
+        self._after = None
+
+    __CHANNELS = {
+        # 'hot': 'http://127.0.0.1:54321?'  ,
+        'hot': 'https://9gag.com/v1/group-posts/group/default/type/hot?c=10&after='
+    }
+
+    def _get_api_uri(self, *,
+                     category: str,
+                     type: str,
+                     count: int,
+                     after: Optional[str]
+                     ) -> str:
+        category = urlquote(category, safe='')
+        type_ = urlquote(type, safe='')
+        query_string = urlencode({
+            'after': after or '',
+            'c': count,
+        })
+        return f'https://9gag.com/v1/group-posts/group/{category}/type/{type_}?{query_string}'
+
+    def _crawl(self) -> ImageCollection:  # pragma: no cover
+        images = ImageCollection()
+        api_uri = self._get_api_uri(category='default', type='hot', count=10, after=self._after)
+        response_raw, api_uri = self._remote_fetcher.get_string(api_uri)
+        # TODO: use response_raw to build the images; for now a hard-coded sample image is added
+        images.add(  # pylint: disable=no-member # false-positive
+            Image(
+                uri='https://img-9gag-fun.9cache.com/photo/aVwXDQP_700bwp.webp',
+                source='https://9gag.com/gag/aVwXDQP',
+                is_generic=True
+            )
+        )
+        return images