#
# file.py  -  Methods to allow full text search and
#             thumbnails in the "file" table of a database.
#
# Copyright (C) 2015 Jan Jockusch <jan.jockusch@perfact-innovation.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
'''Required software:
- poppler-utils (pdftotext, pdftoppm)
- imagemagick (convert)
- ghostscript (ps2pdf)
- libimage-exiftool-perl (exiftool)

This module provides text extractors, thumbnail renderers and tag
extractors for several MIME types.

For indexing, we need the filename, mime type, and binary data.

We maintain a dictionary of known mime types, which new text
extractors and thumbnail creators may register to.

Thumbnail resolution needs to be given as an argument. This allows the
front-end to decide.

You may use this to upload files into the DB-Utils DMS:

  >>> bindata = fileassets['tests.manual-image']  # doctest: +SKIP
  >>> id = file_upload(bindata, filename='manual-image.png',
  ...                  mimetype='image/png')  # doctest: +SKIP
  >>> type(id) == type(0L)  # doctest: +SKIP
  True

Files can be removed from the repository:

  >>> file_remove(id)  # doctest: +SKIP

Lower level functions are also available.
'''

import tempfile
import os
import time
import subprocess
import stat
import string
import shutil  # for rmtree
import six

from .generic import safe_syscall, cleanup_string
from .generic import to_string, to_ustring, to_bytes
from .say import say

from .dbconn import dbconn
from .sql import esc_binary  # for database storage
from .fileassets import fileassets  # get queries for db uploads


# Python 2/3 switches
if not six.PY2:
    # Python 3 has no separate ``long`` type; alias the name to ``int`` so
    # that code referring to ``long`` keeps working.
    long = int

# Bind ``url_quote`` to a URL-quoting function under either major version
# (repo_stream() uses it for the Content-Disposition header).
if six.PY2:
    from urllib import pathname2url as url_quote
else:
    from urllib.parse import quote as url_quote


# Handling of file repository

# Root directory of the on-disk file store. The functions in this module
# derive the 'repository', 'assets' and 'icons' sub-trees from it.
file_repository = '/vol/zope_files'

# DANGER: The repo_... methods need to be secured carefully!


def repo_path(id):
    '''Return the repository path belonging to a numeric file ID.

    The ID is zero-padded to twelve digits. The first nine digits select
    three nested directory levels (three digits each) below
    ``<file_repository>/repository``; the full padded ID is the file name.

    Side effects: the process umask is set to 0o2 and the directory is
    created if possible. Any OSError raised while doing so (for example
    because the directory already exists) is ignored.
    '''
    name = '%012d' % id
    levels = [name[start:start + 3] for start in (0, 3, 6)]
    directory = '/'.join([file_repository + '/repository'] + levels)
    try:
        # Permissions need to allow group access (for db_sync)
        os.umask(0o2)
        os.makedirs(directory, mode=0o770)
    except OSError:
        pass
    return directory + '/' + name


def repo_write(id, data):
    '''Write a copy of ``data`` into the repository file of the given ID.

    ``data`` is either a ``bytes`` object, which is written in one go, or
    a file-like object, which is copied in chunks of 64 KiB until its
    ``read()`` returns an empty result.

    The target file is opened through a context manager so that it is
    closed even if reading from ``data`` or writing fails half-way.
    '''
    with open(repo_path(id), 'wb') as fh:
        if isinstance(data, bytes):
            fh.write(data)
            return
        while True:
            buf = data.read(65536)
            if not buf:
                break
            fh.write(buf)


def repo_size(id):
    '''Return the size in bytes of the repository file with the given ID.'''
    info = os.stat(repo_path(id))
    return info.st_size


def repo_delete(id):
    '''Delete the repository file that belongs to the given ID.

    Errors from the underlying removal (e.g. a missing file) propagate.
    '''
    target = repo_path(id)
    os.unlink(target)


def repo_read(id):
    '''Return the complete binary contents of the file with the given ID.

    The file is opened through a context manager so the handle is closed
    even when reading fails.
    '''
    with open(repo_path(id), 'rb') as fh:
        return fh.read()


def repo_stream(response, id, filename=None, filesize=None, mimetype=None):
    '''Pipe the file with the given ID through the response.

    Optionally sets the Content-Type, Content-Length and
    Content-Disposition headers on ``response`` (each only when the
    corresponding argument is not None). The file is sent in 64 KiB
    chunks via ``response.write()``.

    The file handle is managed by a ``with`` block, so it is released
    even if writing to the response raises (e.g. on a dropped
    connection).
    '''
    with open(repo_path(id), 'rb') as fh:
        # Set headers after opening. If opening fails and Zope generates an
        # error page, having already set these headers will confuse the
        # browser or external program if they are trying to render the
        # error page as PDF.
        if mimetype is not None:
            response.setHeader('Content-Type', mimetype)
        if filesize is not None:
            response.setHeader('Content-Length', filesize)
        if filename is not None:
            response.setHeader('Content-Disposition',
                               "filename*=UTF-8''" + url_quote(filename))

        while True:
            buf = fh.read(65536)
            if not buf:
                break
            response.write(buf)


def repo_get_ids(offset, min_secs=24*60*60):
    '''List the IDs stored in the repository folder that holds ``offset``.

    ``offset`` is a numerical ID (1000, 2000, etc.); the folder containing
    its file is scanned, which yields at most 1000 entries. Only files
    whose modification time lies more than ``min_secs`` seconds in the
    past are returned. Every file name is converted with ``int()``.
    '''
    folder = repo_path(offset).rsplit('/', 1)[0]
    now = time.time()
    return [
        int(entry)
        for entry in os.listdir(folder)
        if (now - os.path.getmtime(os.path.join(folder, entry))) > min_secs
    ]


def repo_virus_check(id):
    '''Run virusCheck() on the repository file of the given ID and
    return its result.'''
    return virusCheck(repo_path(id))


# Handling pass-through of assets

def asset_passthrough(base_path, traverse_subpath, RESPONSE):
    '''Stream a file below ``base_path`` through the response.

    ``traverse_subpath`` is a sequence of path components which are joined
    onto ``base_path``. The file is sent in chunks of 1 MiB after the
    Content-Length, Content-Type and Cache-Control headers have been set.

    Raises AssertionError if a component starts with a dot or contains a
    slash, if the final path is not a regular file (a symbolic link as
    last component is rejected because ``lstat`` is used), or if it is not
    readable. The checks raise explicitly instead of using ``assert``
    statements so that they stay active under ``python -O``.

    NOTE(review): only the last path component is checked with lstat();
    symbolic links in intermediate directories are still followed.
    '''
    # Perform basic sanity checks
    for component in traverse_subpath:
        if component.startswith('.'):
            raise AssertionError("No special or hidden files")
        if '/' in component:
            raise AssertionError("No slashes allowed in file names")
    full_path = os.path.join(base_path, *traverse_subpath)

    # Check that the file itself is not a softlink or special file
    stat_result = os.lstat(full_path)
    if not stat.S_ISREG(stat_result.st_mode):
        raise AssertionError("Only regular files allowed")

    # Check if file is readable
    if not os.access(full_path, os.R_OK):
        raise AssertionError("Permission denied")

    # Retrieve the size and MIME type
    file_size = stat_result.st_size
    mimetype = get_mimetype(full_path)

    RESPONSE.setHeader('Content-Length', str(file_size))
    RESPONSE.setHeader('Content-Type', mimetype)
    RESPONSE.setHeader('Cache-Control', 'max-age=36000')
    # TODO: Set Last-Modified also
    # TODO: Maybe also work with HEAD? Need the request for that...
    with open(full_path, 'rb') as fh:
        while True:
            chunk = fh.read(1024*1024)
            if not chunk:
                break
            RESPONSE.write(chunk)
    return


# Writing of a file to the fs

def upload_asset(data, filename, subpath):
    '''
    Given a filehandle or bytes, a filename, and a path,
    writes the data to <file_repository>/assets/{subpath}/{filename}
    and returns the absolute path of the written file.

    ``subpath`` may be empty, in which case the file is written directly
    into the assets root. The target directory must already exist.

    Raises AssertionError when the subpath or the final path would leave
    the assets root, when the target directory does not exist, or when the
    filename is empty or contains a slash. The checks raise explicitly so
    they remain active under ``python -O``.
    '''
    allowed_root = os.path.abspath(file_repository + "/assets")

    def inside_root(path):
        # A bare startswith(allowed_root) would also accept sibling
        # directories such as ".../assets_other"; require a separator.
        return path == allowed_root or path.startswith(allowed_root + os.sep)

    # Validate subpath (containment first, so that nothing outside the
    # assets root is probed for existence)
    target_dir = allowed_root
    if subpath:
        target_dir = os.path.abspath(os.path.join(allowed_root, subpath))
    if not inside_root(target_dir):
        raise AssertionError("Forbidden Access!")
    if not os.path.exists(target_dir):
        raise AssertionError("Subpath doesn't exist!")

    # Validate filename
    if filename == "" or "/" in filename:
        raise AssertionError("Invalid filename!")

    filepath = os.path.abspath(os.path.join(target_dir, filename))

    # The file itself has to live strictly below the assets root.
    if not filepath.startswith(allowed_root + os.sep):
        raise AssertionError("Forbidden Access!")

    # Write file
    with open(filepath, 'wb') as fh:
        if isinstance(data, bytes):
            fh.write(data)
        else:
            while True:
                buf = data.read(65536)
                if not buf:
                    break
                fh.write(buf)
    return filepath


# Temporary directory / temporary file handling

# The "data" argument of writeTempFile() may be of type long or int, in
# which case it is interpreted as a file ID.
#
# In that case the routine returns the path of the original file in the
# repository instead of writing a copy into the temporary directory.

# Filename sanitation

def cleanup_filename(name):
    '''Return ``name`` passed through cleanup_string(), with ASCII
    letters, digits and the characters "-", "_" and "." declared valid.'''
    allowed = string.ascii_letters + string.digits + '-_.'
    return cleanup_string(name, valid_chars=allowed)


def writeTempFile(filename, data):
    '''Create a fresh temporary directory and make ``data`` available as
    a file.

    Returns the tuple ``(tempdir, path)``. If ``data`` is an integer it is
    taken as a file ID and ``path`` points at the repository file itself
    (nothing is copied). Otherwise ``data`` is written in binary mode to a
    file named after the sanitized ``filename`` inside the new directory.
    '''
    safe_name = cleanup_filename(filename)
    workdir = tempfile.mkdtemp()
    if isinstance(data, six.integer_types):
        return workdir, repo_path(data)
    target = workdir + '/' + safe_name
    with open(target, 'wb') as out:
        out.write(data)
    return workdir, target


def clearTempDir(tempdir):
    '''Get rid of the temporary directory.

    Nothing happens for an empty/None argument or for a path outside
    ``/tmp/``. The path is normalized before the check so that ".."
    components cannot be used to point outside ``/tmp/``.

    Removal is best-effort: errors while deleting are ignored, because
    callers invoke this after their real work is done.
    '''
    if not tempdir:
        return
    target = os.path.abspath(tempdir)
    if not (tempdir.startswith('/tmp/') and target.startswith('/tmp/')):
        return
    # Previously the removal was commented out, which left every temporary
    # directory behind. shutil.rmtree avoids passing the path to a shell.
    shutil.rmtree(target, ignore_errors=True)
    return


def tmp_write(filename, data):
    '''Shortcut for writing into temporary storage.

    ``bytes`` data is written in binary mode, anything else in text mode.

    Raises AssertionError unless ``filename`` lies below ``/tmp/``. The
    path is normalized first, so names like ``/tmp/../etc/x`` are
    rejected, and the check is raised explicitly so that it survives
    ``python -O``.
    '''
    target = os.path.abspath(filename)
    if not (filename.startswith('/tmp/') and target.startswith('/tmp/')):
        raise AssertionError("Only writing to /tmp is allowed.")
    mode = 'wb' if isinstance(data, bytes) else 'w'
    with open(target, mode) as fh:
        fh.write(data)
    return


def tmp_read(filename):
    '''Shortcut for reading from temporary storage (as bytes).

    Raises AssertionError unless ``filename`` lies below ``/tmp/``. The
    path is normalized first, so names like ``/tmp/../etc/x`` are
    rejected, and the check is raised explicitly so that it survives
    ``python -O``.
    '''
    target = os.path.abspath(filename)
    if not (filename.startswith('/tmp/') and target.startswith('/tmp/')):
        raise AssertionError("Only reading from /tmp is allowed.")
    with open(target, 'rb') as fh:
        return fh.read()


def tmp_readstr(filename):
    '''Shortcut for reading from temporary storage (as a string).

    Applies the same restriction as tmp_read(): AssertionError is raised
    unless the normalized ``filename`` lies below ``/tmp/``. Previously
    this function had no such guard and would read any path.
    '''
    target = os.path.abspath(filename)
    if not (filename.startswith('/tmp/') and target.startswith('/tmp/')):
        raise AssertionError("Only reading from /tmp is allowed.")
    with open(target, 'r') as fh:
        return fh.read()


def tmp_cleanup(tempdir):
    '''Shortcut for removing a temporary directory or file.

    A directory is removed recursively only if its path starts with
    ``/tmp/tmp``; a regular file is removed if its path starts with
    ``/tmp/``. Anything else (including an empty/None argument) is
    silently left alone.

    The path is normalized before the prefix checks. Without that, a name
    such as ``/tmp/tmpabc/../other`` would pass the prefix test although
    it refers to a different directory.
    '''
    if not tempdir:
        return
    target = os.path.abspath(tempdir)
    in_tmp = tempdir.startswith('/tmp/') and target.startswith('/tmp/')
    if not in_tmp:
        return
    is_tmpdir_name = (tempdir.startswith('/tmp/tmp')
                      and target.startswith('/tmp/tmp'))
    if is_tmpdir_name and os.path.isdir(target):
        shutil.rmtree(target)
    elif os.path.isfile(target):
        os.remove(target)


# Checksum calculation

def get_checksum(filename, data):
    '''Checksum extractor.

    Makes ``data`` available as a file via writeTempFile() (``data`` may
    be binary content or a file ID) and returns the first field printed
    by the external ``sha256sum`` tool, or '' if it printed nothing.

    Raises subprocess.CalledProcessError if ``sha256sum`` exits non-zero.
    The temporary directory is cleared in a ``finally`` clause so cleanup
    also happens in that case, matching get_mimetype().
    '''
    tempdir, path = writeTempFile(filename, data)
    try:
        checksum = subprocess.check_output(
            ['sha256sum', path],
            universal_newlines=True,
        ).strip().split()
    finally:
        clearTempDir(tempdir)
    return checksum[0] if checksum else ''


# Mimetype extraction (not always transmitted by browser)

def get_mimetype(filename, data=b'', use_filename=None):
    '''Determine the MIME type using the external ``mimetype`` tool.

    `data` is either the file content or a file ID; for an ID the content
    is read from the repository first.
    `use_filename` decides whether the real filename takes part in the
    detection (True) or whether the content is stored under the neutral
    name 'undefined' so that only the content counts (False). The default
    of `None` keeps the historical behavior: the filename is used when
    `data` holds content and ignored when `data` is a file ID.

    Returns the MIME type string, or None if the tool printed nothing.

    >>> get_mimetype('asdf.png')
    'image/png'
    >>> get_mimetype('.ogg')
    'audio/ogg'
    >>> data = b'PK\\x03\\x04\\x14\\x00\\x06\\x00\\x08\\x00'
    >>> get_mimetype('test.xlsx', data)
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    >>> get_mimetype('test.xlsx', data, use_filename=False)
    'application/zip'
    >>> get_mimetype('test.xlsx', data, use_filename=True)
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    '''

    from_repository = isinstance(data, six.integer_types)
    content = repo_read(data) if from_repository else data

    if use_filename is None:
        # Backwards compatibility
        use_filename = not from_repository

    name = filename if use_filename else 'undefined'

    tempdir, path = writeTempFile(name, content)
    try:
        command = ['mimetype', '-b', '-i', path]
        out = subprocess.check_output(
            command, universal_newlines=True).strip()
    finally:
        clearTempDir(tempdir)

    if out:
        # Keep the part before the first '; ' and take its last word.
        first_field = out.split('; ')[0]
        return first_field.split()[-1]
    return None


# Mimetype icons

# Prepare icons in a one-shot manner with this:
# ls /usr/share/icons/gnome/scalable/mimetypes/gnome-mime*.svg | \
# while read i ; do base=$(basename $i .svg) ; \
# inkscape -w 256 -h 256 -e $base.png -f $i ; done

def get_mimetype_icon(mimetype):
    '''Return the PNG icon data for a MIME type, or None.

    Find an icon by first trying the full mimetype, then reducing by
    throwing away details: the type is split at "/" and "-" and trailing
    parts are dropped one by one until a file
    ``<file_repository>/icons/gnome-mime-<parts>.png`` can be opened.

    If this routine returns None, you should fall back to a mimetype
    that's present, like "text". None is also returned for an empty or
    None ``mimetype`` instead of failing.
    '''
    if not mimetype:
        return None
    mime_parts = mimetype.replace('/', '-').split('-')
    path_fmt = file_repository + '/icons/gnome-mime-%s.png'
    while mime_parts:
        filename = path_fmt % '-'.join(mime_parts)
        try:
            fh = open(filename, 'rb')
        except IOError:
            # No icon at this level of detail; try a more generic one.
            mime_parts.pop()
            continue
        # Read outside the try block so that only a failing open() makes
        # us fall back; the handle is closed even if read() raises.
        with fh:
            return fh.read()
    return None


# Mimetype registry handling

# Maps a MIME type to the tuple (text extractor, thumbnail maker,
# tags extractor). Any of the three may be None.
mimetype_registry = {}


def registerMimetype(mimetype, textfnc, thumbfnc, tagsfnc):
    '''Store the text, thumbnail and tags handlers for a mime type,
    replacing any earlier registration of the same type.'''
    handlers = (textfnc, thumbfnc, tagsfnc)
    mimetype_registry[mimetype] = handlers


def _registered_handler(mimetype, slot):
    '''Return the handler in position ``slot`` for ``mimetype``, or None
    if the type is unknown or has no handler in that position.'''
    try:
        handlers = mimetype_registry[mimetype]
    except KeyError:
        return None
    return handlers[slot] or None


def get_text(filename, mimetype, data):
    '''Return the text extracted by the registered text handler, or None
    if no such handler exists for the mime type.'''
    extractor = _registered_handler(mimetype, 0)
    return extractor(filename, data) if extractor else None


def get_thumb(filename, mimetype, data, res=256):
    '''Return the thumbnail produced by the registered thumbnail handler
    at resolution ``res``, or None if no such handler exists.'''
    renderer = _registered_handler(mimetype, 1)
    return renderer(filename, data, res) if renderer else None


def get_tags(filename, mimetype, data):
    '''Return the tags produced by the registered tags handler, or None
    if no such handler exists for the mime type.'''
    tagger = _registered_handler(mimetype, 2)
    return tagger(filename, data) if tagger else None


# Generic handlers

def makeTagsDefault(filename, data):
    '''Default tags extractor.

    Runs ``exiftool`` on the file and returns its textual output. If the
    tool exits with an error, or its output cannot be decoded, the
    problem is reported through say() and '' is returned.
    '''
    workdir, source = writeTempFile(filename, data)
    command = ['exiftool', source]
    try:
        output = subprocess.check_output(command, universal_newlines=True)
    except subprocess.CalledProcessError as failure:
        output = ''
        say('Error generating tags with exiftool',
            level=2,
            error_output=failure.output)
    except UnicodeDecodeError:
        output = ''
        say('Error generating tags with exiftool', level=2)

    clearTempDir(workdir)
    return output


def makeThumbDefault(filename, data, res):
    '''Default thumbnail maker.

    Uses ImageMagick ``convert`` on the first page/frame of the file to
    render a ``res`` x ``res`` PNG (64 colors, 8 bit depth) and returns
    its bytes. The image is produced at twice the target size and then
    halved before cropping around the center.

    Returns None if ``convert`` exits non-zero (reported through say())
    or if the result file cannot be read. The temporary directory is
    cleared in a ``finally`` clause and the result file is read through a
    context manager, so nothing is left open when an error occurs.
    '''
    dbl = res*2
    tempdir, path = writeTempFile(filename, data)
    try:
        convert_returncode = subprocess.call(
            ['convert', path+'[0]',
             '-thumbnail', 'x%d' % dbl,
             '-resize', '%dx<' % dbl,
             '-resize', '50%',
             '-gravity', 'center',
             '-crop', '%dx%d+0+0' % (res, res),
             '+repage',
             '-depth', '8',
             '+dither',
             '-colors', '64',
             '-flatten',
             'out.png'
             ], cwd=tempdir)
        if convert_returncode != 0:
            say('Thumbnail creation failed',
                filename=filename,
                returncode=convert_returncode,
                level=2)
            return None
        try:
            with open(tempdir+'/out.png', 'rb') as fh:
                return fh.read()
        except IOError:
            return None
    finally:
        clearTempDir(tempdir)


# PDF handling

def ocrPDF(filename, data, ocr_lang=None, timeout=None):
    '''Attempt to extract text from a PDF and embed it into the original.
    This attempts to use OCR to extract text from scanned PDFs.
    Pass the parameter "ocr_lang" to switch to another language
    (one of 'eng', 'deu', 'eng+deu').
    Pass "timeout" (seconds) to run the OCR under the ``timeout`` tool.

    Returns the bytes of the OCRed PDF, or None if the tool refused
    because the file already contains text (exit code 6).

    Raises AssertionError for an invalid language, on timeout (exit code
    124 while a timeout is set) and for any other non-zero exit code.
    These are raised explicitly: with ``assert False`` a failed OCR run
    would silently return None under ``python -O``.

    >>> data = fileassets['tests.ocrtest']  # doctest: +SKIP
    >>> new_data = ocrPDF('ocrtest.pdf', data,
    ...                   ocr_lang='deu')  # doctest: +SKIP
    >>> txt = makeTextPDF('ocrtest.pdf', new_data)  # doctest: +SKIP
    >>> 'Nummern: 28839108984674' in txt  # doctest: +SKIP
    True
    >>> 'Email: info@perfact.de' in txt  # doctest: +SKIP
    True
    >>> 'Lorem ipsum dolor sit amet' in txt  # doctest: +SKIP
    True

    Files which are already text, should not be converted:

    >>> data = fileassets['tests.ocrtestfail']  # doctest: +SKIP
    >>> new_data = ocrPDF('ocrtestfail.pdf', data,
    ...                   ocr_lang='deu')  # doctest: +SKIP
    >>> new_data == None  # doctest: +SKIP
    True
    '''
    # Validate the options before any temporary file is created.
    opts = []
    if ocr_lang:
        valid_langs = ['eng', 'deu', 'eng+deu', ]
        if ocr_lang not in valid_langs:
            raise AssertionError("Invalid language")
        opts.extend(['-l', ocr_lang, ])

    # Try to OCR this file (this requires the ocrmypdf utility!)
    tempdir, path = writeTempFile(filename, data)
    try:
        outpath = tempdir + '/out.pdf'
        ocrcmd = ['/usr/bin/ocrmypdf', ] + opts + [
            path, outpath,
        ]
        if timeout is not None:
            ocrcmd = ['timeout', str(timeout)] + ocrcmd
        retcode, output = safe_syscall(ocrcmd, raisemode=False)
        if retcode == 0:
            # Successful. Read OCRed version and pass back.
            with open(outpath, 'rb') as fh:
                return fh.read()
        if retcode == 6:
            # File already contains text, OCR refused.
            return None
        if timeout is not None and retcode == 124:
            raise AssertionError("OCR timeout")
        raise AssertionError(
            "OCR failed with return code {}".format(retcode))
    finally:
        clearTempDir(tempdir)

def makeTextPDF(filename, data):
    '''Text extractor for PDF files.

    Runs ``pdftotext`` and returns at most 16M of its output as text.
    Once that much has been read the child process is terminated.

    The child's pipe is closed and the process is waited for, so that no
    zombie process or open file descriptor is left behind; the temporary
    directory is cleared even if reading fails.
    '''
    limit = 0x1000000  # Limit to 16M of text data.
    tempdir, path = writeTempFile(filename, data)
    try:
        proc = subprocess.Popen(
            ['pdftotext', path, '-'],
            universal_newlines=True,
            stdout=subprocess.PIPE
        )
        try:
            text = proc.stdout.read(limit)
        finally:
            proc.stdout.close()
            proc.terminate()
            proc.wait()  # reap the child
    finally:
        clearTempDir(tempdir)
    return text


# PS handling

def makeTextPS(filename, data):
    '''Text extractor for PS files.

    Converts the file to PDF with ``ps2pdf`` and then runs ``pdftotext``
    on the result, returning at most 16M of its output as text.

    Raises subprocess.CalledProcessError if ``ps2pdf`` fails. The
    ``pdftotext`` pipe is closed and the process is waited for, so that
    no zombie process or open file descriptor is left behind; the
    temporary directory is cleared even on errors.
    '''
    limit = 0x1000000  # Limit to 16M of text data.
    tempdir, path = writeTempFile(filename, data)
    try:
        subprocess.check_call(
            ['ps2pdf', path, tempdir+'/out.pdf']
        )
        proc = subprocess.Popen(
            ['pdftotext', tempdir+'/out.pdf', '-'],
            universal_newlines=True,
            stdout=subprocess.PIPE
        )
        try:
            text = proc.stdout.read(limit)
        finally:
            proc.stdout.close()
            proc.terminate()
            proc.wait()  # reap the child
    finally:
        clearTempDir(tempdir)
    return text


# MS-Word handling

def makeTextMSWord(filename, data):
    '''Text extractor for MSWord files.

    Runs ``unoconv`` (wrapped in ``/usr/bin/timeout`` with a 10 second
    limit) to convert the file to plain text on stdout. The output is
    converted with to_ustring() and cut to at most 16M characters. The
    exit status of the conversion is not inspected.
    '''
    limit = 0x1000000  # Limit to 16M of text data.
    tempdir, path = writeTempFile(filename, data)

    timeout = 10
    command = ['/usr/bin/timeout', str(timeout),
               'unoconv', '--stdout', '-f', 'txt', path]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() yields (stdout, stderr); stderr is not captured here.
    raw_text, _stderr = proc.communicate()
    clearTempDir(tempdir)
    return to_ustring(raw_text)[:limit]


# Built-in registrations: which handlers exist for which mime type.
registerMimetype(mimetype='application/pdf', textfnc=makeTextPDF,
                 thumbfnc=makeThumbDefault, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='application/postscript', textfnc=makeTextPS,
                 thumbfnc=makeThumbDefault, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='application/msword', textfnc=makeTextMSWord,
                 thumbfnc=None, tagsfnc=None)
registerMimetype(mimetype='image/svg+xml', textfnc=None,
                 thumbfnc=makeThumbDefault, tagsfnc=None)
registerMimetype(mimetype='image/jpeg', textfnc=None,
                 thumbfnc=makeThumbDefault, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='image/png', textfnc=None,
                 thumbfnc=makeThumbDefault, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='image/gif', textfnc=None,
                 thumbfnc=makeThumbDefault, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='application/ogg', textfnc=None,
                 thumbfnc=None, tagsfnc=makeTagsDefault)
registerMimetype(mimetype='audio/mpeg', textfnc=None,
                 thumbfnc=None, tagsfnc=makeTagsDefault)


# Specials

# TODO: pstopdf is not a binary found in any package on Ubuntu 16 or 18
# systems. Drop this function?
def pdfFirstPage(filename, data):
    '''Create a new PDF file containing only the first page of the original.

    Pipes the output of ``pdftops`` (restricted to page 1) into
    ``pstopdf`` and returns what the latter prints.

    Raises subprocess.CalledProcessError if ``pstopdf`` exits non-zero.
    The first process's pipe is closed and the process is waited for, so
    no zombie process or open descriptor remains; the temporary directory
    is cleared even on errors.

    NOTE(review): ``pdftops`` is called without an output argument;
    confirm that it really writes the PostScript to stdout in this form.
    '''
    tempdir, path = writeTempFile(filename, data)
    try:
        p1 = subprocess.Popen(
            ['pdftops', '-f', '1', '-l', '1', path, ],
            stdout=subprocess.PIPE,
        )
        try:
            newpdf = subprocess.check_output(
                ['pstopdf', '-', '-'],
                stdin=p1.stdout,
            )
        finally:
            p1.stdout.close()
            p1.wait()  # reap the first stage of the pipeline
    finally:
        clearTempDir(tempdir)
    return newpdf


def virusCheck(filename):
    '''Scan a file with ``clamdscan``.

    Returns a dictionary with the keys 'result' ('clean' for exit code 0,
    'infected' for exit code 1), 'retcode' and 'output'.
    Raises AssertionError for any other exit code.
    '''
    cmds = ['clamdscan', filename]
    retcode, output = safe_syscall(cmds=cmds)

    verdicts = {0: 'clean', 1: 'infected'}
    if retcode not in verdicts:
        raise AssertionError(
            ("virusCheck could not be executed: "
             "%s exited with code: %s and output: %s") %
            (cmds, retcode, output))

    return {
        'result': verdicts[retcode],
        'retcode': retcode,
        'output': output,
    }


def file_upload(newfile, id=None, caption=None,
                filename=None, mimetype=None, avoid_anonymous=True,
                use_file_repository=True,
                username='__system__', commit=True):
    '''Port of file_d.sec.upload().

    Store an uploaded file as a new record, or overwrite the record with
    the given id, and then generate its metadata (thumbnail, text
    extraction, ...) through generate_metadata().

    Attention: This is no port of file_d.upload.new()!
               auto_tag and virus checks are not included.
               However, this creates files in their default status, which
               switched from 1 (in preparation) to 7 (in quarantine) in
               DB-Utils 3.14. Files in quarantine can not be accessed by users
               until a corresponding job runs these checks and moves
               quarantined files into preparation or rejects them.

    Inputs:
    - newfile: file-like object or bytes with data
    - id: optional file_id for overwriting
    - caption: optional caption, else use filename
    - filename: optional, taken from file-like object or ''
    - mimetype: optional if file-like object has a content-type header
    - avoid_anonymous: optional, if False allow Anonymous Users
      (not referenced in the body of this function)
    - use_file_repository: optional, False stores bytea in database
    - username: optional, defaults to '__system__'
    - commit: optional, False leaves committing to the caller

    Returns the id of the written record. When overwriting, an
    ``assert`` fails with "File locked." if the record is not editable.
    '''

    # Fall back to the name carried by a file-like upload object.
    if not filename:
        filename = getattr(newfile, 'filename', '')

    # Keep only the last path component (either separator), sanitized.
    basename = filename.replace('\\', '/').rsplit('/', 1)[-1]
    filename = cleanup_filename(basename)

    # Default caption
    caption = caption or filename

    if not mimetype:
        # This fails when we pass raw data.
        try:
            mimetype = newfile.headers['content-type']
        except (AttributeError, KeyError):
            pass

    # Unicode conversion
    filename, caption, mimetype, username = [
        to_ustring(value)
        for value in (filename, caption, mimetype, username)
    ]

    if use_file_repository:
        # Content goes to the repository; the DB column stays empty.
        bindata = ''
        size = 0
    else:
        try:
            bindata = newfile.read()
        except AttributeError:
            bindata = newfile
        size = len(bindata)

    if id:
        dbconn.execute(fileassets['file.upload_validate'], id=id)
        vals = dbconn.dictionaries()
        assert len(vals) and vals[0]['editable'], "File locked."
        statement = 'file.upload_upd'
        target = {'id': id}
    else:
        statement = 'file.upload_ins'
        target = {}

    dbconn.execute(fileassets[statement],
                   bindata=esc_binary(bindata),
                   caption=caption,
                   filename=filename, mimetype=mimetype,
                   username=username,
                   **target)
    id = dbconn.tuples()[0][0]

    if use_file_repository:
        repo_write(id, newfile)
        size = repo_size(id)

    dbconn.execute(fileassets['file.upload_set_size'],
                   id=id, size=size)

    if commit:
        dbconn.commit()

    # Update metadata
    generate_metadata(id=id, commit=commit)

    return id


def file_remove(id, use_file_repository=True, commit=True):
    '''Remove a file record from the database and, unless
    ``use_file_repository`` is False, its file from the repository.
    The transaction is committed unless ``commit`` is False.
    '''
    query = fileassets['file.remove']
    dbconn.execute(query, id=id)
    if use_file_repository:
        repo_delete(id)
    if commit:
        dbconn.commit()


def _metadata_item_content(item):
    '''Return the binary content belonging to one result row: the inline
    ``file_data`` if it is set, otherwise the repository file of
    ``file_id``.'''
    if item.file_data is not None:
        # bytes() instead of str(): identical on Python 2 (bytes is str),
        # while on Python 3 str() would produce a textual representation
        # of the object rather than the stored content.
        return bytes(item.file_data)
    return repo_read(item.file_id)


def generate_metadata(id, override_mimetype=True, commit=True):
    ''' re-implementation of file_d.sec.metadata.generate()

    If the id is given, update the metadata of that record.
    Otherwise, check the queue of pending file metadata updates.

    Works in two passes over the "file.meta_generate" query:
    1. detect and store the MIME type of each record. If
       ``override_mimetype`` is true, a type known for the file extension
       replaces the detected one.
    2. compute checksum, tags, extracted text and thumbnail and store
       them. If no thumbnail can be rendered, the icon for the MIME type,
       then the "text" icon, then an empty value is used.

    The transaction is committed at the end unless ``commit`` is False.
    '''

    dbconn.execute(fileassets['file.meta_generate'],
                   id=id)
    for item in dbconn.result():
        filename = to_string(item.file_filename)
        data = _metadata_item_content(item)

        mimetype = get_mimetype(filename, data)
        if override_mimetype:
            # Override the detected mime type by file extension
            mimetype = mimetype_by_extension(item.file_filename, mimetype)

        dbconn.execute(fileassets['file.meta_generate_mimetype'],
                       id=item.file_id, mimetype=mimetype)

    # Query again for the second pass.
    dbconn.execute(fileassets['file.meta_generate'],
                   id=id)
    for item in dbconn.result():
        filename = to_string(item.file_filename)
        data = _metadata_item_content(item)

        checksum = get_checksum(filename, data) or ''

        tags = get_tags(filename, item.file_mimetype, data) or ''
        tags = to_ustring(tags)
        text = get_text(filename, item.file_mimetype, data) or ''
        text = to_ustring(text)

        thumb = get_thumb(filename, item.file_mimetype, data)
        if thumb is None:
            thumb = get_mimetype_icon(item.file_mimetype)
        if thumb is None:
            thumb = get_mimetype_icon('text')
        if thumb is None:
            thumb = ''

        dbconn.execute(fileassets['file.meta_generate_upd'],
                       id=item.file_id, checksum=checksum,
                       tags=tags, text=text,
                       thumb=esc_binary(thumb))
    if commit:
        dbconn.commit()
    return


# Lookup table from (lower-case) file extension to MIME type. Built once
# at import time instead of on every call.
_MIMETYPES_BY_EXTENSION = {
    'txt': 'text/plain',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',

    'manifest': 'application/manifest',
    'xaml': 'application/xaml+xml',
    'application': 'application/x-ms-application',
    'deploy': 'application/octet-stream',
    'xbap': 'application/x-ms-xbap',
    'docm': 'application/vnd.ms-word.document.macroEnabled.12',
    'docx': ('application/vnd.openxmlformats-officedocument.'
             'wordprocessingml.document'),
    'dotm': 'application/vnd.ms-word.template.macroEnabled.12',
    'dotx': ('application/vnd.openxmlformats-officedocument.'
             'wordprocessingml.template'),
    'potm': 'application/vnd.ms-powerpoint.template.macroEnabled.12',
    'potx': ('application/vnd.openxmlformats-officedocument.'
             'presentationml.template'),
    'ppam': 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
    'ppsm': 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
    'ppsx': ('application/vnd.openxmlformats-officedocument.'
             'presentationml.slideshow'),
    'pptm': 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
    'pptx': ('application/vnd.openxmlformats-officedocument.'
             'presentationml.presentation'),
    'xlam': 'application/vnd.ms-excel.addin.macroEnabled.12',
    'xlsb': 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
    'xlsm': 'application/vnd.ms-excel.sheet.macroEnabled.12',
    'xlsx': ('application/vnd.openxmlformats-officedocument.'
             'spreadsheetml.sheet'),
    'xltm': 'application/vnd.ms-excel.template.macroEnabled.12',
    'xltx': ('application/vnd.openxmlformats-officedocument.'
             'spreadsheetml.template'),

    'odt': 'application/vnd.oasis.opendocument.text',
    'ods': 'application/vnd.oasis.opendocument.spreadsheet',
    'doc': 'application/msword',
    'dot': 'application/msword',
    'pdf': 'application/pdf',
    'rtf': 'application/rtf',
    'zip': 'application/zip',
    'xml': 'application/xml',
    'xls': 'application/vnd.ms-excel',
    'htm': 'text/html',
    'html': 'text/html',
    'eml': 'message/rfc822',
    'msg': 'message/rfc822',
    'dat': 'application/ms-tnef',  # This is a little too generic...
}


def mimetype_by_extension(filename, default=None):
    '''Return the MIME type registered for the extension of ``filename``.

    The extension is the part after the last dot and is matched without
    regard to case ("report.PDF" is treated like "report.pdf").
    ``default`` is returned if the name contains no dot at all or the
    extension is unknown; a name such as "zip" is therefore no longer
    mistaken for the extension "zip".
    '''
    _stem, dot, suffix = filename.rpartition('.')
    if not dot:
        return default
    return _MIMETYPES_BY_EXTENSION.get(suffix.lower(), default)


def create_zipfile(files):
    '''Create a zipfile containing some files.

    Input: Dictionary of files. Each key is a filename, each value is
    the binary content.

    Output: Zipfile bytes.

    Raises AssertionError for an illegal filename or if the external
    ``zip`` tool reports a problem. Both checks are raised explicitly so
    that they remain active under ``python -O``.

    Side Effects: The temporary data in /tmp will remain there until
    cleaned up by the cron job.

    >>> files = {'test.txt': 'ABCD', 'another_file.txt': 'Hello world!'}
    >>> create_zipfile(files).startswith(b'PK\\x03\\x04\\n')
    True

    You need to pass clean file names. Only ascii letters, digits and "-_.+"
    are allowed:
    >>> create_zipfile({'no blanks': 'ABCD', 'no/slashes': 'ABCD'})
    Traceback (most recent call last):
      ...
    AssertionError: Illegal filename. ...
    '''

    # check for sane filenames
    valid_chars = string.ascii_letters + string.digits + '-_.+'
    for filename in files:
        if cleanup_string(filename, valid_chars=valid_chars) != filename:
            raise AssertionError(
                "Illegal filename. Valid characters are: " + valid_chars
            )

    tempdir = tempfile.mkdtemp()
    for filename, data in files.items():
        with open(tempdir+'/'+filename, 'wb') as fh:
            fh.write(to_bytes(data))
    # Sorted member order, as before.
    retcode = subprocess.call(
        ['zip', 'install.zip', ] + sorted(files),
        cwd=tempdir,
    )
    if retcode != 0:
        raise AssertionError("Problem creating zipfile.")
    with open(tempdir + '/install.zip', 'rb') as fh:
        return fh.read()
