Skip to content
This repository has been archived by the owner on Jul 24, 2023. It is now read-only.

Commit

Permalink
started work on 9gag crawler #119 ... and set it on hold
Browse files Browse the repository at this point in the history
  • Loading branch information
jkowalleck committed May 21, 2020
1 parent a10e7af commit 025cbcb
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 8 deletions.
13 changes: 13 additions & 0 deletions examples/config/dev_9gag.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## This is a config file for nichtparasoup (v3.0)
## This is an example file. For a quick start run: nichtparasoup server config --help

webserver:
hostname: "0.0.0.0"
port: 8080

imageserver:
crawler_upkeep: 10

crawlers:
- name: "9gag"
# config:
18 changes: 10 additions & 8 deletions src/nichtparasoup/imagecrawlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,16 @@ def _builtins() -> Dict[_ImagecrawlerName, _ImagecrawlerClass]:
from .reddit import Reddit
from .instagram import InstagramHashtag, InstagramProfile
from .pr0gramm import Pr0gramm
return dict(
Echo=Echo,
Picsum=Picsum,
Reddit=Reddit,
InstagramProfile=InstagramProfile,
InstagramHashtag=InstagramHashtag,
Pr0gramm=Pr0gramm,
)
from .ninegag import NineGag
return {
'Echo': Echo,
'Picsum': Picsum,
'Reddit': Reddit,
'InstagramProfile': InstagramProfile,
'InstagramHashtag': InstagramHashtag,
'Pr0gramm': Pr0gramm,
'9gag': NineGag,
}

def __init__(self, entries: Iterable[EntryPoint]) -> None: # pragma: no cover
self._list: List[_Imagecrawler] = [(n, c) for n, c in self._builtins().items()]
Expand Down
81 changes: 81 additions & 0 deletions src/nichtparasoup/imagecrawlers/ninegag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
__all__ = ["NineGag"]

from typing import Any, Dict, Optional
from urllib.parse import urlencode, quote as urlquote

from ..imagecrawler import BaseImageCrawler, Image, ImageCollection, ImageCrawlerConfig, ImageCrawlerInfo, RemoteFetcher

# @TODO fix crawling
# currently the "api" is not crawlable ... thanks to CloudFlare protection ...
# curl -v -L \
# -H 'Referer: https://9gag.com/' \
# -H 'Accept: application/json' \
# -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0' \
# 'https://9gag.com/v1/group-posts/group/default/type/hot?c=10&after='
#

class NineGag(BaseImageCrawler):
    """Image crawler for 9gag.com.

    NOTE(review): work in progress and currently on hold — the 9gag "API"
    sits behind CloudFlare protection and cannot be crawled reliably (see
    the curl notes at the top of this module). ``_crawl`` therefore still
    returns a single hard-coded placeholder image.
    """

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        """Initialize crawler state and the remote fetcher.

        The fetcher's headers mimic a Firefox browser request.
        """
        super().__init__(**config)
        # Pagination cursor handed back by the API; None means "start over".
        self._after: Optional[str] = None
        # HACK: the Cookie header below contains captured session values
        # (__cfduid, _pk_*, ...). These expire and are tied to one session —
        # they must not ship in a release. TODO: obtain cookies dynamically.
        self._remote_fetcher = RemoteFetcher(headers={
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://9gag.com/',
            'Cookie': '__cfduid=dabf5c79209d50bb41e6d457b5e0a94b61590077498; ____ri=165; ____lo=DE; gag_tz=2; sign_up_referer=https%3A%2F%2Fduckduckgo.com%2F; _pk_id.7.f7ab=2e8a243e691eff4a.1590077498.2.1590082408.1590082407.; _pk_ref.7.f7ab=%5B%22%22%2C%22%22%2C1590082407%2C%22https%3A%2F%2Fduckduckgo.com%2F%22%5D; _pk_ses.7.f7ab=*',
        })

    @classmethod
    def info(cls) -> ImageCrawlerInfo:  # pragma: no cover
        """Describe this crawler for config help and the web UI.

        TODO: fill in the description texts and config documentation.
        """
        return ImageCrawlerInfo(
            description='',
            long_description='',
            config={},
            icon_url='https://upload.wikimedia.org/wikipedia/commons/9/97/9GAG_new_logo.svg',
        )

    @classmethod
    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
        """Validate the crawler configuration.

        TODO: no config options are defined yet, so every config is
        accepted and an empty :class:`ImageCrawlerConfig` is returned.
        """
        return ImageCrawlerConfig()

    def _reset(self) -> None:  # pragma: no cover
        """Reset pagination so the next crawl starts from the beginning."""
        self._after = None

    def _get_api_uri(self, *,
                     category: str,
                     type: str,  # noqa: A002 -- shadows builtin; kept for keyword-call compatibility
                     count: int,
                     after: Optional[str]
                     ) -> str:
        """Build the group-posts API URI.

        :param category: post group, e.g. ``'default'`` (URL-quoted here).
        :param type: post type, e.g. ``'hot'`` (URL-quoted here).
        :param count: number of posts to request.
        :param after: pagination cursor from a previous response, or None.
        :return: absolute API URI.
        """
        category = urlquote(category, safe='')
        type_ = urlquote(type, safe='')
        query_string = urlencode({
            'after': after or '',
            'c': count,
        })
        return f'https://9gag.com/v1/group-posts/group/{category}/type/{type_}?{query_string}'

    def _crawl(self) -> ImageCollection:  # pragma: no cover
        """Fetch one batch of images.

        TODO: crawling is blocked by CloudFlare — the API response is
        fetched but not parsed yet, and a single hard-coded placeholder
        image is returned so the crawler stays usable end-to-end.
        """
        images = ImageCollection()
        api_uri = self._get_api_uri(category='default', type='hot', count=10, after=self._after)
        # Response is currently discarded; parsing is blocked by CloudFlare.
        self._remote_fetcher.get_string(api_uri)
        images.add(  # pylint: disable=no-member # false-positive
            Image(
                uri='https://img-9gag-fun.9cache.com/photo/aVwXDQP_700bwp.webp',
                source='https://9gag.com/gag/aVwXDQP',
                is_generic=True,
            )
        )
        return images

0 comments on commit 025cbcb

Please sign in to comment.