# mailwww.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# kate: space-indent on; tab-width 4; indent-width 4;

""" @package docstring
Cronjob emailer script

Reads an HTML page from a Web server and sends it through email

@author Gabriele Tozzi <gabriele@tozzi.eu>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import re
import logging
import posixpath

from optparse import OptionParser

import urllib, urlparse
from HTMLParser import HTMLParser

import smtplib
from email.Utils import COMMASPACE, formatdate, make_msgid
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart


class Main:
    """ Command line application: fetches a Web page and sends it as an HTML email

    The page is downloaded (optionally POSTing credentials), decoded to
    unicode, gets its linked style sheets embedded unless disabled, and is
    finally delivered through SMTP to the recipients given on the command line.
    """

    NAME = 'mailwww'
    VERSION = '0.5'

    # Port used to reach the SMTP server (there is no command line option for it)
    SMTP_PORT = 25

    # <meta http-equiv="Content-Type" content="..."> tag, captures the content value.
    # The trailing slash is optional so both HTML (">") and XHTML ("/>") forms match.
    _META_CONTENT_TYPE_RE = re.compile(
        r'<meta\s+http-equiv=(?:"|\')Content-Type(?:"|\')\s+content=(?:"|\')([^\'"]*)(?:"|\')\s*/?>',
        re.I | re.M)

    # charset=<value> parameter, optionally quoted, stops at the next ';' or blank
    _CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']*)', re.I)

    def run(self):
        """ Main entry point

        Parses the command line, downloads the page, embeds its style sheets
        (unless --no-css is given) and sends the result by email.
        """
        options, url, dest = self.__parseCommandLine()

        cc = [options.cc] if options.cc else []

        logging.basicConfig(level=logging.DEBUG if options.verbose else logging.INFO)

        html = self.__fetchPage(url, options.http_user, options.http_pass)

        # Retrieve linked style sheets and put them inline
        if not options.nocss:
            logging.info('Fetching Style Sheets...')
            lister = CSSLister(url)
            lister.feed(html)
            lister.close()
            for search, replace in lister.get_replacements():
                html = html.replace(search, replace, 1)

        msg = self.__buildMessage(html, options.sender, options.subject,
            dest, cc, options.multiple)
        self.__send(msg, options.smtp, self.SMTP_PORT, options.smtp_user,
            options.smtp_pass, options.sender, dest, cc, options.multiple)

    def __parseCommandLine(self):
        """ Reads the command line

        Terminates through parser.error() when fewer than two positional
        arguments (the url and at least one address) are given.

        @return tuple (options, url, list of recipient addresses)
        """
        usage = "%prog [options] <url> <address> [<address2>] [<address...>]"
        parser = OptionParser(usage=usage, version=self.NAME + ' ' + self.VERSION)
        parser.add_option("--http-user", dest="http_user",
            help="Username for HTTP POST authentication")
        parser.add_option("--http-pass", dest="http_pass",
            help="Password for HTTP POST authentication")
        parser.add_option("-s", "--smtp", dest="smtp",
            help="SMTP server address. Default: localhost",
            default='localhost')
        parser.add_option("--smtp-user", dest="smtp_user",
            help="Username for SMTP authentication")
        parser.add_option("--smtp-pass", dest="smtp_pass",
            help="Password for SMTP authentication")
        parser.add_option("-c", "--cc", dest="cc",
            help="Carbon Copy recipient")
        parser.add_option("-f", "--from", dest="sender",
            help="eMail sender. Default: emailer@localhost",
            default="emailer@localhost")
        parser.add_option("-j", "--subject", dest="subject",
            help="eMail Subject. Default: MailWWW Autogenerated Mail",
            default="MailWWW Autogenerated Mail")
        parser.add_option("-n", "--no-css", dest="nocss",
            help="Disable embedding of linked Style Sheets",
            default=False, action="store_true")
        parser.add_option("-m", "--multiple", dest="multiple",
            help="Send multiple emails: one for each recipient (Cc field is ignored)",
            default=False, action="store_true")
        parser.add_option("-v", "--verbose", dest="verbose",
            help="Show progress information",
            default=False, action="store_true")

        (options, args) = parser.parse_args()

        if len(args) < 2:
            parser.error("invalid number of arguments")

        # First positional argument is the url, all the others are recipients
        return options, args[0], args[1:]

    def __fetchPage(self, url, http_user=None, http_pass=None):
        """ Downloads the page and decodes it

        @param url The address of the page
        @param http_user Username to POST, if any
        @param http_pass Password to POST, if any
        @return The page content as unicode
        """
        logging.info('Fetching url %s', url)
        data = None
        if http_user or http_pass:
            # Use POST authentication
            data = urllib.urlencode({ 'username': http_user, 'password': http_pass, 'login': True })
        f = urllib.urlopen(url, data)
        try:
            raw = f.read()
            # The header may be missing: fall back to an empty value instead of failing
            content_type = f.headers.get('content-type', '')
        finally:
            f.close()

        html, encoding = self.__decode(raw, content_type)
        logging.info('Detected charset: %s', encoding)
        return html

    def __decode(self, raw, content_type):
        """ Converts the downloaded bytes to unicode

        The charset declared by the meta content-type tag is tried first, then
        the one from the HTTP Content-Type header, then utf-8. A charset name
        unknown to Python is skipped in favour of the next candidate.

        @param raw The page as downloaded
        @param content_type Value of the HTTP Content-Type header (may be empty)
        @return tuple (unicode page, name of the charset used)
        """
        candidates = []
        match = self._META_CONTENT_TYPE_RE.search(raw)
        if match:
            candidates.append(self.__parseEncoding(match.group(1)))
        candidates.append(self.__parseEncoding(content_type))
        # Last resort, always a valid codec name
        candidates.append('utf-8')

        for encoding in candidates:
            try:
                return unicode(raw, encoding, errors='replace'), encoding
            except LookupError:
                logging.debug('Unknown charset %s, trying next one', encoding)

    def __buildMessage(self, html, sender, subject, dest, cc, multiple):
        """ Prepares the mail

        @param html The unicode HTML body
        @param sender Value for the From header
        @param subject Value for the Subject header
        @param dest List of recipient addresses
        @param cc List of carbon copy addresses
        @param multiple When True, To and Cc are left out: To is set per recipient while sending
        @return The MIMEMultipart message
        """
        msg = MIMEMultipart()
        msg['Date'] = formatdate(localtime=True)
        msg['Message-ID'] = make_msgid('emailer')
        msg['Subject'] = subject
        msg['From'] = sender
        if not multiple:
            if cc:
                msg['Cc'] = ', '.join(cc)
            msg['To'] = ', '.join(dest)
        msg.preamble = 'This is a multi-part message in MIME format.'
        msg.attach(MIMEText(html.encode('utf-8'), 'html', 'utf-8'))
        return msg

    def __send(self, msg, host, port, user, pwd, sender, dest, cc, multiple):
        """ Sends the message through SMTP

        @param msg The message to send
        @param host SMTP server address
        @param port SMTP server port
        @param user Username for SMTP authentication, no login is attempted when empty
        @param pwd Password for SMTP authentication
        @param sender Envelope sender
        @param dest List of recipient addresses
        @param cc List of carbon copy addresses (not used when multiple is True)
        @param multiple When True, sends one separate mail to each recipient
        """
        smtp = smtplib.SMTP()
        smtp.connect(host, port)
        try:
            if user:
                smtp.login(user, pwd)
            if multiple:
                for d in dest:
                    del msg['To']
                    msg['To'] = d
                    # Each mail is a distinct message, so it needs its own id
                    msg.replace_header('Message-ID', make_msgid('emailer'))
                    logging.info('Sending mail to: %s', d)
                    smtp.sendmail(sender, d, msg.as_string())
            else:
                logging.info('Sending mail to: %s, Cc: %s', dest, cc)
                smtp.sendmail(sender, dest+cc, msg.as_string())
        finally:
            # Always release the connection, even when sending failed
            try:
                smtp.quit()
            except smtplib.SMTPServerDisconnected:
                # Connection is already gone, nothing left to close
                pass

    def __parseEncoding(self, encstr, default='utf-8'):
        """ Extracts the charset name from a content-type like string

        Understands "text/html; charset=utf-8", quoted values, additional
        parameters after the charset and any letter case of "charset". A string
        without a charset parameter is taken as the charset name itself, unless
        it contains a slash (i.e. it is a bare MIME type).

        @param encstr The string to parse, may be empty or None
        @param default Value returned when no charset can be found
        @return The charset name, or default
        """
        if not encstr:
            return default
        match = self._CHARSET_RE.search(encstr)
        if match:
            encoding = match.group(1)
        else:
            encoding = encstr.strip()
        if not encoding or '/' in encoding:
            return default
        return encoding


class CSSLister(HTMLParser):
    """ HTML parser that collects the style sheets linked by a page

    For every <link rel="stylesheet" href="..."> start tag fed to the parser,
    the style sheet is downloaded and a (link tag text, <style> block) pair is
    stored, so the caller can replace the former with the latter in the page.
    """

    def __init__(self, baseurl):
        """ @param baseurl URL of the page being parsed, hrefs are resolved against it """
        self.__baseurl = baseurl
        HTMLParser.__init__(self)
        self.__log = logging.getLogger('css')

    def reset(self):
        """ Resets the parser state, forgetting the replacements found so far """
        self.__repl = []
        HTMLParser.reset(self)

    def __resolve(self, href):
        """ Turns an href found in the page into an absolute URL

        Handles relative, root-relative ("/css/a.css") and absolute hrefs.

        @param href The href attribute value
        @return The absolute URL of the resource
        """
        return urlparse.urljoin(self.__baseurl, href)

    def handle_starttag(self, tag, attrs):
        """ Downloads the style sheet when the tag is a style sheet link

        @param tag Lowercase tag name
        @param attrs List of (name, value) attribute pairs
        """
        if tag != 'link':
            return
        # Attribute values keep the case used in the page: compare ignoring it
        is_css = any(k == 'rel' and v and v.strip().lower() == 'stylesheet'
            for k, v in attrs)
        if not is_css:
            return
        # Use the first href; it is None or empty when the attribute carries no value
        href = next((v for k, v in attrs if k == 'href'), None)
        if not href:
            return

        # Go get the CSS
        url = self.__resolve(href)
        self.__log.info('Fetching CSS %s', url)
        c = urllib.urlopen(url)
        try:
            raw = c.read()
        finally:
            c.close()
        # Decode here so non-ASCII style sheets can be merged into the unicode
        # page; the style sheet charset is not inspected, UTF-8 is assumed
        css = u"<style>\n" + raw.decode('utf-8', 'replace') + u"</style>\n"
        self.__repl.append( (self.get_starttag_text(), css) )

    def handle_endtag(self, data):
        """ End tags are not relevant here """
        pass

    def get_replacements(self):
        """ @return List of (original link tag text, replacement style block) pairs """
        return self.__repl

if __name__ == '__main__':
    # Script entry: run the mailer once, then report success to the caller
    Main().run()
    sys.exit(0)