# scrap.py

# -*- coding: utf-8 -*-
import argparse
import codecs
import shutil
import os
import json
import hashlib

from operator import attrgetter

import bleach
import dateutil.parser
import requests
from jinja2 import Environment, FileSystemLoader

# Absolute path of the ``theme`` directory located next to this module.
THEME_PATH = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    'theme',
)


class Author(object):
    """A person referenced by the raw data, reduced to an id and a name.

    :param data: mapping providing the ``id`` and ``name`` keys
    """

    def __init__(self, data):
        # Both keys are mandatory: a missing one raises ``KeyError``.
        self.id, self.name = data['id'], data['name']


class Comment(object):
    """A comment attached to an entry, built from one raw comment mapping.

    Attributes set on the instance:

    * ``id`` -- value of the ``id`` key (mandatory)
    * ``picture`` -- value of the ``picture`` key, or ``None``
    * ``content`` -- the ``message`` text run through ``bleach.linkify``,
      or ``None`` when there is no message
    * ``author`` -- an :class:`Author` built from the ``from`` key, or ``None``
    * ``date`` -- ``created_time`` parsed with ``dateutil`` (mandatory)
    * ``likes`` -- list of :class:`Author`, empty when there is no ``likes`` key

    :param item: raw comment mapping
    """

    def __init__(self, item):
        self.id = item['id']
        self.picture = item.get('picture')
        self.content = bleach.linkify(item['message']) if 'message' in item else None
        # 'from' is treated as optional, exactly like Entry does, so that a
        # comment without author information does not abort the whole run.
        self.author = Author(item['from']) if 'from' in item else None
        self.date = dateutil.parser.parse(item['created_time'])
        self.likes = [Author(d) for d in item['likes']['data']] if 'likes' in item else []


class Entry(object):
    """A top-level post of the archive, built from one raw item mapping.

    ``id`` and ``created_time`` are mandatory keys. ``picture``, ``message``
    and ``from`` are optional and yield ``None`` when absent; ``likes`` and
    ``comments`` are optional and yield empty lists when absent.

    :param item: raw post mapping
    """

    def __init__(self, item):
        self.id = item['id']

        self.picture = None
        if 'picture' in item:
            self.picture = item['picture']

        # The message is stored with its URLs turned into links.
        self.content = None
        if 'message' in item:
            self.content = bleach.linkify(item['message'])

        self.author = None
        if 'from' in item:
            self.author = Author(item['from'])

        self.date = dateutil.parser.parse(item['created_time'])

        self.likes = []
        if 'likes' in item:
            self.likes = [Author(liker) for liker in item['likes']['data']]

        self.comments = []
        if 'comments' in item:
            self.comments = [Comment(raw) for raw in item['comments']['data']]


def render_template(output_path, tpl_name, filename, **options):
    """Render a theme template and write the result as a UTF-8 file.

    :param output_path: directory receiving the rendered file; it is created
        when it does not exist yet
    :param tpl_name: name of the template, looked up under ``THEME_PATH``
    :param filename: name of the file written inside ``output_path``
    :param options: keyword arguments handed to the template as its context
    """
    env = Environment(loader=FileSystemLoader(THEME_PATH))
    template = env.get_template(tpl_name)
    output = template.render(**options)

    # The output directory is only created as a side effect of downloading
    # pictures, so make sure it exists before writing into it. An empty
    # path means "current directory" and needs no creation.
    if output_path and not os.path.isdir(output_path):
        os.makedirs(output_path)

    full_path = os.path.join(output_path, filename)

    with codecs.open(full_path, 'w', encoding='utf-8') as f:
        f.write(output)


def copy(source, destination):
    """Copy a file or a whole directory tree from source to destination.

    Adapted from Pelican.

    A file source is copied to the file path ``destination`` (missing parent
    directories are created). A directory source is mirrored recursively
    into ``destination``. Anything else is silently ignored, as is a
    directory copy whose destination exists but is not a directory.

    :param source: the source file or directory
    :param destination: the destination file or directory
    """
    src_root = os.path.abspath(os.path.expanduser(source))
    dst_root = os.path.abspath(os.path.expanduser(destination))

    # Single file: make sure the parent exists, copy, done.
    if os.path.isfile(src_root):
        parent = os.path.dirname(dst_root)
        if not os.path.exists(parent):
            os.makedirs(parent)
        shutil.copy2(src_root, dst_root)
        return

    # Neither a file nor a directory: nothing to copy.
    if not os.path.isdir(src_root):
        return

    if not os.path.exists(dst_root):
        os.makedirs(dst_root)
    if not os.path.isdir(dst_root):
        return

    for current, _subdirs, filenames in os.walk(src_root):
        target = os.path.join(dst_root, os.path.relpath(current, src_root))

        # os.walk is top-down, so the parent of ``target`` already exists
        # and a plain mkdir is enough.
        if not os.path.isdir(target):
            os.mkdir(target)

        for name in filenames:
            origin = os.path.join(current, name)
            if os.path.isfile(origin):
                shutil.copy2(origin, os.path.join(target, name))


def download(url, output_path):
    """Download ``url`` into ``output_path`` under a name derived from the URL.

    The file name is the hexadecimal MD5 digest of the URL, so a given URL
    is fetched at most once: when the file already exists no request is made.

    :param url: address of the resource to fetch (text or bytes)
    :param output_path: directory where the file is stored; created if missing
    :return: the file name (not the full path) inside ``output_path``. It is
        returned even when the server did not answer 200 and nothing was
        written.
    """
    print("downloading %s" % url)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # hashlib only accepts bytes: a text URL must be encoded first, otherwise
    # Python 3 raises TypeError.
    url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
    filename = hashlib.md5(url_bytes).hexdigest()
    file_path = os.path.join(output_path, filename)

    if not os.path.exists(file_path):
        resp = requests.get(url, stream=True)
        try:
            if resp.status_code == 200:
                # Have the raw stream undo any transfer compression.
                resp.raw.decode_content = True
                # Write under a temporary name first so that an interrupted
                # transfer never leaves a truncated file that a later run
                # would mistake for a cached copy.
                partial_path = file_path + '.part'
                with open(partial_path, 'wb') as f:
                    shutil.copyfileobj(resp.raw, f)
                os.rename(partial_path, file_path)
        finally:
            # A streamed response holds its connection until closed.
            resp.close()
    return filename


def get_attachments(item, image_path, token):
    """Fetch the attachments of ``item`` and download their pictures.

    Queries the Graph API ``attachments`` edge of ``item.id``. When the
    answer carries a non-empty ``data`` list, ``item.pictures`` is set to the
    file names of all pictures downloaded for those attachments (possibly an
    empty list). Otherwise ``item`` is left untouched.

    :param item: object exposing an ``id`` attribute; may gain ``pictures``
    :param image_path: directory where the pictures are downloaded
    :param token: Graph API access token
    """
    resp = requests.get('https://graph.facebook.com/v2.12/%s/attachments' % item.id, params={
        'access_token': token,
    })
    # Decode the body once instead of re-parsing it for every access.
    payload = resp.json()
    data = payload.get('data') if isinstance(payload, dict) else None
    if data:
        pictures = []
        for attachment in data:
            # Keep the pictures of every attachment, not only the first one;
            # an unsupported attachment type may yield None and is skipped.
            pictures.extend(get_from_type(attachment, image_path) or [])
        item.pictures = pictures


def get_from_type(item, image_path):
    """Download the picture(s) described by one attachment mapping.

    ``photo`` attachments yield one file; ``album`` attachments yield the
    files of all their subattachments, resolved recursively. Any other type
    yields nothing.

    :param item: raw attachment mapping with a ``type`` key
    :param image_path: directory where the pictures are downloaded
    :return: list of downloaded file names, empty for unsupported types
    """
    kind = item.get('type')
    if kind == 'photo':
        return [download(item['media']['image']['src'], image_path)]
    if kind == 'album':
        pictures = []
        # An album may lack subattachments or mix in unsupported ones; both
        # cases simply contribute no picture.
        for sub in item.get('subattachments', {}).get('data', []):
            pictures.extend(get_from_type(sub, image_path))
        return pictures
    return []


def parse_data(data):
    """Build one ``Entry`` per raw item and return them sorted by ``date``.

    :param data: iterable of raw item mappings
    :return: list of entries in ascending ``date`` order
    """
    by_date = attrgetter('date')
    return sorted((Entry(raw) for raw in data), key=by_date)


def enhance_entries(entries, output_path, token):
    """Run ``get_attachments`` on every entry.

    Pictures are downloaded into the ``pictures`` sub-directory of
    ``output_path``.

    :param entries: iterable of entries to process
    :param output_path: root directory of the generated archive
    :param token: access token forwarded to ``get_attachments``
    """
    image_dir = os.path.join(output_path, 'pictures')
    for current in entries:
        get_attachments(current, image_dir, token)


def generate_archive(data, output_path, token):
    """Generate the archive index page from a JSON dump.

    The JSON file is parsed into entries, the entries are enriched with
    their downloaded pictures, and ``index.html`` is rendered into
    ``output_path``.

    :param data: path of the JSON file holding the raw items
    :param output_path: directory receiving the generated files
    :param token: access token used to query attachments
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, matching the explicit UTF-8 used for the rendered output.
    with codecs.open(data, 'r', encoding='utf-8') as f:
        data_json = json.load(f)
    entries = parse_data(data_json)
    enhance_entries(entries, output_path, token)
    render_template(output_path, 'index.html', 'index.html', entries=entries)


def copy_assets(output_path):
    """Copy the theme's ``fonts`` and ``assets`` directories into output_path.

    :param output_path: directory receiving the copied directories
    """
    for folder in ('fonts', 'assets'):
        copy(os.path.join(THEME_PATH, folder), os.path.join(output_path, folder))


def parse_args(argv=None):
    """Parse the command-line options of the script.

    :param argv: list of argument strings to parse; ``None`` (the default)
        makes argparse read ``sys.argv[1:]``, which is the historical
        behaviour
    :return: namespace with ``data``, ``output_path`` and ``token``
        (``token`` is ``None`` when the option is not given)
    """
    parser = argparse.ArgumentParser(description='Generate facebook group archive pages.')
    parser.add_argument('--data', dest='data', default='data.json',
                        help='Location of the JSON file containing the data.')
    parser.add_argument('--output', dest='output_path',
                        default='output',
                        help='Path where to output the generated files.')
    parser.add_argument('--token', dest='token', help='the access token from Facebook graph API.')
    return parser.parse_args(argv)


def _main():
    """Script entry point: build the archive, then copy the theme assets."""
    options = parse_args()
    generate_archive(options.data, options.output_path, options.token)
    copy_assets(options.output_path)
    # Finish with an empty line on stdout.
    print('')


if __name__ == '__main__':
    _main()