diff --git a/archive-process-email b/archive-process-email index a76c934defab2d98fb2726b306a363d52ebe5561..57eb9c7dcea4fcfbdbb5e5526e1ba2a8418abd78 100755 --- a/archive-process-email +++ b/archive-process-email @@ -16,11 +16,14 @@ import os.path from email.parser import BytesHeaderParser from email.utils import getaddresses +from nm2.lib.fileops import is_gzipped + # TODO: once nm.debian.org is python3, move most of this code to process/ and # make it unit-tested VERSION="0.2" + class umask_override: """ Context manager that temporarily overrides the umask during its lifetime @@ -80,6 +83,9 @@ class IncomingMessage: self.msg.add_header("NM-Archive-Lookup-History", "exception: {}: {}".format(exc.__class__.__name__, str(exc))) def deliver_to_mailbox(self, pathname): + if os.path.exists(pathname) and is_gzipped(pathname): + print(f"{pathname} is archived, ignoring the incoming message.") + return with umask_override(0o037) as uo: with open(pathname, "ab") as out: out.write(self.msg.as_string(True).encode("utf-8")) @@ -114,37 +120,6 @@ class IncomingMessage: return None - def lookup_mailbox_filename(self, key, sqlite=False): - db, Q = open_db(sqlite) - - cur = db.cursor() - query = """ - SELECT pr.archive_key - FROM person p - JOIN process pr ON pr.person_id = p.id - WHERE pr.is_active - """ - - if '=' in key: - # Lookup email - email = key.replace("=", "@") - self.log_lookup("lookup by email '%s'" % email) - cur.execute(Q(query + "AND p.email=%s"), (email,)) - else: - # Lookup uid - self.log_lookup("lookup by uid '%s'" % key) - cur.execute(Q(query + "AND p.uid=%s"), (key,)) - - basename = None - for i, in cur: - basename = i - - if basename is None: - return None - else: - return basename + ".mbox" - - def get_dest_pathname(msg, sqlite=False): """ Return a couple (destdir, filename) with the default directory and mailbox @@ -159,13 +134,7 @@ def get_dest_pathname(msg, sqlite=False): # New-style processes return "/srv/nm.debian.org/mbox/processes", "process-{}.mbox".format(key) else: - # Old-style 
processes, need a DB lookup - fname = msg.lookup_mailbox_filename(key, sqlite) - if fname is None: - msg.log_lookup("Key {} not found in the database".format(repr(key))) - return "/srv/nm.debian.org/mbox/", "archive-failsafe.mbox" - else: - return "/srv/nm.debian.org/mbox/applicants", fname + return None, None except Exception as e: msg.log_exception(e) return "/srv/nm.debian.org/mbox/", "archive-failsafe.mbox" @@ -183,6 +152,9 @@ def main(): msg = IncomingMessage(sys.stdin.buffer) destdir, filename = get_dest_pathname(msg, args.sqlite) + if filename is None: + return 1 + # Override destdir if requested if args.dest: destdir = args.dest diff --git a/nm2/lib/fileops.py b/nm2/lib/fileops.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf5b0eeb3cbf8d10f0d073fa957573d8ba62079 --- /dev/null +++ b/nm2/lib/fileops.py @@ -0,0 +1,70 @@ +#!/usr/bin/python3 +""" +Small library for file manipulation +""" + +import os +import gzip +import lzma +import shutil + +from contextlib import contextmanager + + +GZIP_MAGIC_NUMBER = "1f8b" + + +def is_gzipped(pathname): + """ + Returns True if the file pointed by `pathname` is very probably + gzipped + """ + import builtins  # the module-level open() below shadows the builtin + with builtins.open(pathname, "rb") as fd: + file_begin = fd.read(2).hex() + return file_begin == GZIP_MAGIC_NUMBER +def do_gzip_file(pathname): + """ + Actually gzip a file from pathname to pathname + ".gz" + + Doesn't do anything if the file seems already gzipped. 
+ """ + + dirname, fname = os.path.split(pathname) + gzip_pathname = os.path.join(dirname, f"{fname}.gz") + + if is_gzipped(pathname): + print(f"{pathname} looks already gzipped") + return None + + if os.path.exists(gzip_pathname): + os.unlink(gzip_pathname) + + with open(pathname, 'rb') as f_in: + with gzip.open(gzip_pathname, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + if os.path.exists(gzip_pathname): + os.unlink(pathname) + + return True + + +@contextmanager +def open(pathname, mode="r"): + """ + Opens a file with the appropriate compression library if needed + """ + + if filename.endswith(".gz"): + my_fd = gzip.open(filename, mode) + elif filename.endswith(".xz"): + my_fd = lzma.open(filename, mode) + else: + my_fd = open(filename, mode) + + try: + yield my_fd + finally: + my_fd.close() diff --git a/process/housekeeping.py b/process/housekeeping.py index f06da0b6bb0721eea651fc6e4447526b2bc11375..2fc2cdfd5b3439412ae566716eed2aac1db5bc1b 100644 --- a/process/housekeeping.py +++ b/process/housekeeping.py @@ -5,6 +5,7 @@ from backend.housekeeping import Housekeeper from .maintenance import ( ping_stuck_processes, submit_rt_ticket_for_fd_approved_processes, + archive_old_processes_mailbox, ) STAGES = ["main"] @@ -24,3 +25,12 @@ class OpenApprovedRTTickets(hk.Task): def run_main(self, stage): approval_to_rt_delay = datetime.timedelta(days=7) submit_rt_ticket_for_fd_approved_processes(approval_to_rt_delay) + + +class ArchiveOldProcessesMbox(hk.Task): + DEPENDS = [Housekeeper] + + def run_main(self, stage): + closed_archival_delay = datetime.timedelta(days=28) + closed_archival_treshold = datetime.timedelta(days=56) + archive_old_processes_mailbox(closed_archival_delay, closed_archival_treshold) diff --git a/process/maintenance.py b/process/maintenance.py index 6b7743e75331ea5c9eba6b5b0c5255cb94bafcb0..9ec58c4f127d94f89f28a388b3a4acb742a375d2 100644 --- a/process/maintenance.py +++ b/process/maintenance.py @@ -1,9 +1,12 @@ from __future__ import annotations 
from typing import Optional from django.utils.timezone import now +import datetime + +from nm2.lib import fileops + from . import models as pmodels from . import ops as pops -import datetime def ping_stuck_processes(stuck_cutoff, audit_author, logdate=None): @@ -165,3 +168,36 @@ def send_rt_ticket(process, statement): ) op.rt_text = statement.statement op.execute() + + +def archive_old_processes_mailbox(closed_archival_delay: datetime.timedelta, + closed_archival_threshold: datetime.timedelta): + """ + Grabs all processes closed for more than closed_archival_delay and less + than closed_archival_threshold, and archive their mailboxes + """ + + most_recent = now() - closed_archival_delay + oldest = now() - closed_archival_threshold + + processes = pmodels.Process.objects.filter( + closed_time__lte=most_recent, closed_time__gte=oldest + ) + + for process in processes: + archive_old_process_mailbox(process) + + +def archive_old_process_mailbox(process): + """ + Archive the mailbox of a process by making it a GZip file + """ + + mailbox_path = process.mailbox_file + if fileops.is_gzipped(mailbox_path): + return True + if not process.closed: + return False + + fileops.do_gzip_file(mailbox_path) + return True diff --git a/process/views.py b/process/views.py index 459a10042c60caf230bbe26a241dc274540d929b..ac7487ddf863a19609df600753d78924b7232f75 100644 --- a/process/views.py +++ b/process/views.py @@ -15,7 +15,7 @@ from backend.shortcuts import build_absolute_uri from backend.mixins import VisitorMixin, VisitPersonMixin, TokenAuthMixin from backend import const import backend.models as bmodels -from nm2.lib import assets +from nm2.lib import assets, fileops import nm2.lib.forms from .mixins import VisitProcessMixin, RequirementMixin, StatementMixin import datetime @@ -528,7 +528,7 @@ class MailArchive(VisitProcessMixin, View): # The last mtime argument seems to only be supported in python 2.7 outfd = GzipFile(user_fname, "wb", 9, res) # , os.path.getmtime(fname)) try: - with 
open(fname, "rb") as infd: + with fileops.open(fname, "rb") as infd: shutil.copyfileobj(infd, outfd) outfd.write(b"\n") outfd.write(self.process.get_statements_as_mbox(self.request.user))