Skip to content
Commits on Source (5)
......@@ -20,7 +20,7 @@ install:
- pip install python-coveralls
- python setup.py -q install
script:
- nosetests -a '!network'
- nosetests -a '!network,!local_irods'
- flake8 --ignore E501 biomaj_download/*.py biomaj_download/download
deploy:
provider: pypi
......
3.1.0:
#16 Don't change name after download in DirectHTTPDownloader
PR #7 Refactor downloaders (*WARNING* breaks API)
3.0.27:
Fix previous release broken with a bug in direct protocols
3.0.26:
......
......@@ -17,6 +17,19 @@ To compile protobuf, in biomaj_download/message:
flake8 biomaj_download/\*.py biomaj_download/download
# Test
To run the test suite, use:
nosetests -a '!local_irods' tests/biomaj_tests.py
This command skips the tests that need a local iRODS server.
Some test might fail due to network connection. You can skip them with:
nosetests -a '!network' tests/biomaj_tests.py
(To skip the local iRODS test and the network tests, use `-a '!network,!local_irods'`).
# Run
......
"""
Subclasses for direct download (i.e. downloading without regexp). The usage is
a bit different: instead of calling method:`list` and method:`match`, client
code explicitly calls method:`set_files_to_download` (passing a list
containing only the file name). method:`list` is used to get more information
about the file (if possible). method:`match` matches everything.
Also client code can use method:`set_save_as` to indicate the name of the file
to save.
The trick for the implementation is to override
method:`_append_file_to_download` to initialize the rfile with the file name
and dummy values. Note that we use a list of rfile even if it contains only one
file.
method:`list` modifies files_to_download directly.
method:`match` doesn't call method:`_append_file_to_download` (since the list of
files to download is already set up).
We also override method:`set_files_to_download` to check that we pass only one
file.
"""
import datetime
import time
import pycurl
import os
import re
import hashlib
import sys
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.curl import CurlDownload
from biomaj_core.utils import Utils
if sys.version_info[0] < 3:
......@@ -20,28 +37,21 @@ except ImportError:
from StringIO import StringIO as BytesIO
class DirectFTPDownload(FTPDownload):
class DirectFTPDownload(CurlDownload):
'''
download a list of files from FTP, no regexp
'''
def __init__(self, protocol, host, rootdir=''):
'''
ALL_PROTOCOLS = ["ftp", "ftps"]
def _append_file_to_download(self, filename):
'''
Initialize the files in list with today as last-modification date.
Size is also preset to zero, size will be set after download
Size is also preset to zero.
'''
FTPDownload.__init__(self, protocol, host, rootdir)
self.save_as = None
self.headers = {}
def set_files_to_download(self, files):
today = datetime.date.today()
self.files_to_download = []
for file_to_download in files:
rfile = {}
rfile['root'] = ''
rfile['root'] = self.rootdir
rfile['permissions'] = ''
rfile['group'] = ''
rfile['user'] = ''
......@@ -49,188 +59,76 @@ class DirectFTPDownload(FTPDownload):
rfile['month'] = today.month
rfile['day'] = today.day
rfile['year'] = today.year
if file_to_download.endswith('/'):
rfile['name'] = file_to_download[:-1]
if filename.endswith('/'):
rfile['name'] = filename[:-1]
else:
rfile['name'] = file_to_download
rfile['name'] = filename
rfile['hash'] = None
if self.param:
if 'param' not in file_to_download or not file_to_download['param']:
rfile['param'] = self.param
self.files_to_download.append(rfile)
# Use self.save_as even if we use it in list(). This is important.
rfile['save_as'] = self.save_as
super(DirectFTPDownload, self)._append_file_to_download(rfile)
def set_files_to_download(self, files_to_download):
    """
    Set the (single) file to download.

    Direct downloaders handle exactly one file per run, so a longer list
    is rejected instead of being silently truncated.

    :param files_to_download: list containing exactly one file entry
    :type files_to_download: list
    :raises ValueError: if more than one file is passed
    """
    if len(files_to_download) > 1:
        # Reset the list so a failed call leaves no stale state behind
        self.files_to_download = []
        msg = self.__class__.__name__ + ' accepts only 1 file'
        self.logger.error(msg)
        raise ValueError(msg)
    return super(DirectFTPDownload, self).set_files_to_download(files_to_download)
def list(self, directory=''):
    '''
    FTP protocol does not give us the possibility to get file date from remote url

    Only fills in 'save_as' for the files registered via
    method:`set_files_to_download` (defaulting to the remote file name)
    and returns them unchanged.

    :param directory: unused, kept for interface compatibility
    :return: tuple (files, directories); directories is always empty here
    '''
    for rfile in self.files_to_download:
        # Default save_as to the remote file name when the client did not
        # set one explicitly via set_save_as.
        if self.save_as is None:
            self.save_as = rfile['name']
        rfile['save_as'] = self.save_as
    # TODO: are we sure about this implementation ?
    return (self.files_to_download, [])
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
All files to download match, no pattern
'''
if dir_list is None:
dir_list = []
self.files_to_download = file_list
pass
class DirectHttpDownload(DirectFTPDownload):
class DirectHTTPDownload(DirectFTPDownload):
def __init__(self, protocol, host, rootdir=''):
'''
:param file_list: list of files to download on server
:type file_list: list
'''
DirectFTPDownload.__init__(self, protocol, host, rootdir)
self.save_as = None
ALL_PROTOCOLS = ["http", "https"]
def __init__(self, curl_protocol, host, rootdir=''):
DirectFTPDownload.__init__(self, curl_protocol, host, rootdir)
self.method = 'GET'
self.param = {}
def download(self, local_dir, keep_dirs=True):
'''
Download remote files to local_dir
:param local_dir: Directory where files should be downloaded
:type local_dir: str
:param keep_dirs: keep file name directory structure or copy file in local_dir directly
:param keep_dirs: bool
:return: list of downloaded files
'''
self.logger.debug('DirectHTTP:Download')
nb_files = len(self.files_to_download)
if nb_files > 1:
self.files_to_download = []
self.logger.error('DirectHTTP accepts only 1 file')
cur_files = 1
for rfile in self.files_to_download:
if self.kill_received:
raise Exception('Kill request received, exiting')
if not self.save_as:
self.save_as = rfile['name']
else:
rfile['save_as'] = self.save_as
file_dir = local_dir
if keep_dirs:
file_dir = local_dir + os.path.dirname(self.save_as)
file_path = file_dir + '/' + os.path.basename(self.save_as)
# For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
if not os.path.exists(file_dir):
os.makedirs(file_dir)
self.logger.debug('DirectHTTP:Download:Progress' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'] + ', save as ' + self.save_as)
cur_files += 1
if 'url' not in rfile:
rfile['url'] = self.url
fp = open(file_path, "wb")
curl = pycurl.Curl()
if self.proxy is not None:
curl.setopt(pycurl.PROXY, self.proxy)
if self.proxy_auth is not None:
curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
if self.method == 'POST':
# Form data must be provided already urlencoded.
postfields = urlencode(self.param)
# Sets request method to POST,
# Content-Type header to application/x-www-form-urlencoded
# and data to send in request body.
if self.credentials is not None:
curl.setopt(pycurl.USERPWD, self.credentials)
curl.setopt(pycurl.POSTFIELDS, postfields)
try:
curl.setopt(pycurl.URL, rfile['url'] + rfile['root'] + '/' + rfile['name'])
except Exception:
curl.setopt(pycurl.URL, (rfile['url'] + rfile['root'] + '/' + rfile['name']).encode('ascii', 'ignore'))
else:
url = rfile['url'] + rfile['root'] + '/' + rfile['name'] + '?' + urlencode(self.param)
try:
curl.setopt(pycurl.URL, url)
except Exception:
curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
curl.setopt(pycurl.WRITEDATA, fp)
start_time = datetime.datetime.now()
start_time = time.mktime(start_time.timetuple())
curl.perform()
end_time = datetime.datetime.now()
end_time = time.mktime(end_time.timetuple())
rfile['download_time'] = end_time - start_time
curl.close()
fp.close()
self.logger.debug('downloaded!')
rfile['name'] = self.save_as
self.set_permissions(file_path, rfile)
return self.files_to_download
def header_function(self, header_line):
    """
    pycurl HEADERFUNCTION callback: accumulate response headers into
    ``self.headers`` with lowercased names and whitespace-stripped values.

    The HTTP status line (``HTTP/1.x ...``) and any line without a colon
    (including continuations of headers folded over several lines) are
    ignored.
    """
    # The HTTP standard specifies headers are iso-8859-1 encoded; pycurl
    # hands us raw bytes, so decode before parsing. (On Python 2 the
    # decode would be optional, but it is harmless.)
    decoded = header_line.decode('iso-8859-1')
    field, sep, raw_value = decoded.partition(':')
    if not sep:
        # No colon: status line or malformed header — nothing to record.
        return
    # Header names are case-insensitive, so normalize to lowercase; the
    # value keeps its case, minus surrounding whitespace and the trailing
    # newline that pycurl includes.
    self.headers[field.strip().lower()] = raw_value.strip()
def _file_url(self, file_to_download):
    """
    Build the full URL of a remote file.

    For GET requests the query parameters (``self.param``) are appended
    as a urlencoded query string; for other methods (e.g. POST, where the
    parameters go in the request body) the parent URL is returned as-is.
    """
    url = super(DirectHTTPDownload, self)._file_url(file_to_download)
    if self.method == "GET":
        # NOTE(review): a bare '?' is appended even when self.param is
        # empty — presumably harmless for the servers targeted; confirm.
        url += '?' + urlencode(self.param)
    return url
def list(self, directory=''):
'''
Try to get file headers to get last_modification and size
'''
self._basic_curl_configuration()
# Specific configuration
self.crl.setopt(pycurl.HEADER, True)
self.crl.setopt(pycurl.NOBODY, True)
for rfile in self.files_to_download:
if self.save_as is None:
self.save_as = rfile['name']
rfile['save_as'] = self.save_as
self.crl.setopt(pycurl.HEADER, True)
if self.credentials is not None:
self.crl.setopt(pycurl.USERPWD, self.credentials)
if self.proxy is not None:
self.crl.setopt(pycurl.PROXY, self.proxy)
if self.proxy_auth is not None:
self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
self.crl.setopt(pycurl.NOBODY, True)
file_url = self._file_url(rfile)
try:
self.crl.setopt(pycurl.URL, self.url + self.rootdir + rfile['name'])
self.crl.setopt(pycurl.URL, file_url)
except Exception:
self.crl.setopt(pycurl.URL, (self.url + self.rootdir + rfile['name']).encode('ascii', 'ignore'))
self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
# Create a buffer and assign it to the pycurl object
output = BytesIO()
# lets assign this buffer to pycurl object
self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
self.crl.perform()
# Figure out what encoding was sent with the response, if any.
......
import pycurl
import re
import hashlib
import datetime
import humanfriendly
from biomaj_core.utils import Utils
from biomaj_download.download.ftp import FTPDownload
try:
from io import BytesIO
except ImportError:
from StringIO import StringIO as BytesIO
class HTTPParse(object):
    """
    Container for the regular expressions and capture-group indexes used
    to extract directory and file entries from an HTTP listing page.

    The values come from the bank configuration keys
    ``http.parse.dir.line`` / ``http.parse.file.line`` (regexps matching
    one directory / one file line) and ``http.group.*`` (1-based indexes
    of the capture groups holding the name, date and size).
    """

    def __init__(self, dir_line, file_line, dir_name=1, dir_date=2,
                 file_name=1, file_date=2, file_date_format=None, file_size=3):
        r'''
        Example configuration values:

        http.parse.dir.line: <img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
        http.parse.file.line: <img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
        http.group.dir.name: 1
        http.group.dir.date: 2
        http.group.file.name: 1
        http.group.file.date: 2
        http.group.file.size: 3
        '''
        # Regexps matching one directory / one file line of the page.
        self.dir_line = dir_line
        self.file_line = file_line
        # 1-based capture-group indexes inside those regexps.
        self.dir_name = dir_name
        self.dir_date = dir_date
        self.file_name = file_name
        self.file_date = file_date
        # Optional strptime format for the file date; -1 group indexes
        # mean "not available in the page".
        self.file_date_format = file_date_format
        self.file_size = file_size
class HTTPDownload(FTPDownload):
    '''
    Base class to download files from HTTP
    Makes use of http.parse.dir.line etc.. regexps to extract page information

    protocol=http
    server=ftp.ncbi.nih.gov
    remote.dir=/blast/db/FASTA/
    remote.files=^alu.*\\.gz$
    '''

    def __init__(self, protocol, host, rootdir, http_parse=None):
        # http_parse: an HTTPParse instance holding the regexps and group
        # indexes used by list() to parse the remote listing page.
        FTPDownload.__init__(self, protocol, host, rootdir)
        self.http_parse = http_parse

    def list(self, directory=''):
        '''
        List FTP directory

        Fetches the HTML listing page with curl, then applies the
        http_parse regexps to extract directory and file entries.

        :return: tuple of file and dirs in current directory with details
        '''
        self.logger.debug('Download:List:' + self.url + self.rootdir + directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            # Fall back to an ascii-only URL if pycurl rejects the string
            self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'
        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        r'''
        http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
        http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
        http.group.dir.name': 1,
        http.group.dir.date': 2,
        http.group.file.name': 1,
        http.group.file.date': 2,
        http.group.file.size': 3,
        '''
        rfiles = []
        rdirs = []
        # Directories: every match of the dir regexp becomes an rfile dict
        # with dummy permissions/owner and the parsed date.
        dirs = re.findall(self.http_parse.dir_line, result)
        if dirs is not None and len(dirs) > 0:
            for founddir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = 0
                # Group indexes in http_parse are 1-based, tuples are 0-based
                date = founddir[self.http_parse.dir_date - 1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = int(parts[0])
                rfile['year'] = int(parts[2])
                rfile['name'] = founddir[self.http_parse.dir_name - 1]
                rdirs.append(rfile)
        # Files: same idea, plus optional size and configurable date format
        files = re.findall(self.http_parse.file_line, result)
        if files is not None and len(files) > 0:
            for foundfile in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                if self.http_parse.file_size != -1:
                    # Size is human-readable in the page (e.g. "1.2M")
                    rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
                else:
                    rfile['size'] = 0
                if self.http_parse.file_date != -1:
                    date = foundfile[self.http_parse.file_date - 1]
                    if self.http_parse.file_date_format:
                        # Config escapes '%' as '%%'; unescape before strptime
                        date_object = datetime.datetime.strptime(date, self.http_parse.file_date_format.replace('%%', '%'))
                        rfile['month'] = date_object.month
                        rfile['day'] = date_object.day
                        rfile['year'] = date_object.year
                    else:
                        dirdate = date.split()
                        parts = dirdate[0].split('-')
                        # 19-Jul-2014 13:02
                        rfile['month'] = Utils.month_to_num(parts[1])
                        rfile['day'] = int(parts[0])
                        rfile['year'] = int(parts[2])
                else:
                    # No date in the page: stamp with today's date
                    today = datetime.datetime.now()
                    date = '%s-%s-%s' % (today.year, today.month, today.day)
                    rfile['month'] = today.month
                    rfile['day'] = today.day
                    rfile['year'] = today.year
                rfile['name'] = foundfile[self.http_parse.file_name - 1]
                # Hash over name+date+size: used to detect remote changes
                filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
                rfile['hash'] = hashlib.md5(filehash).hexdigest()
                rfiles.append(rfile)
        return (rfiles, rdirs)
......@@ -24,12 +24,30 @@ class _FakeLock(object):
class DownloadInterface(object):
'''
Main interface that all downloaders must extend
Main interface that all downloaders must extend.
The methods are divided into 2 broad categories:
- setters which act on properties of the downloader; those methods are
important in microservice mode
- file operations which are used to list and match remote files, download
them, etc.
Usually, it is enough to overload list, _append_file_to_download and
_download.
TODO:
- the purpose of some setters (set_server, set_protocol) is not clear
since a subclass cannot always change those parameters arbitrarily
- chroot is not used in BioMaJ
'''
files_num_threads = 4
def __init__(self):
# This variable defines the protocol as passed by the config file (i.e.
# this is directftp for DirectFTPDownload). It is used by the workflow
# to send the download message so it must be set.
self.protocol = None
self.config = None
self.files_to_download = []
self.files_to_copy = []
......@@ -47,13 +65,16 @@ class DownloadInterface(object):
self.logger = logging.getLogger('biomaj')
self.param = None
self.method = None
self.protocol = None
self.server = None
self.offline_dir = None
# Options
self.protocol_options = {}
self.skip_check_uncompress = False
#
# Setters for downloader
#
def set_offline_dir(self, offline_dir):
self.offline_dir = offline_dir
......@@ -61,15 +82,13 @@ class DownloadInterface(object):
self.server = server
def set_protocol(self, protocol):
"""
Method used by DownloadService to set the protocol. This value is
passed from the config file so is not always a real protocol (for
instance it can be "directhttp" for a direct downloader).
"""
self.protocol = protocol
def set_files_to_download(self, files):
self.files_to_download = files
for file_to_download in self.files_to_download:
if self.param:
if 'param' not in file_to_download or not file_to_download['param']:
file_to_download['param'] = self.param
def set_param(self, param):
self.param = param
......@@ -100,6 +119,54 @@ class DownloadInterface(object):
def set_method(self, method):
self.method = method
def set_credentials(self, userpwd):
    '''
    Set credentials in format user:pwd

    Stored as-is in self.credentials; downloaders pass it to their
    transport (e.g. curl USERPWD).

    :param userpwd: credentials
    :type userpwd: str
    '''
    self.credentials = userpwd
def set_options(self, protocol_options):
    """
    Set protocol specific options.

    Subclasses that override this method must call the
    parent implementation.

    :param protocol_options: option name -> value mapping
    :type protocol_options: dict
    """
    self.protocol_options = protocol_options
    # Generic option understood by every downloader: skip the archive
    # integrity check after download (value is parsed as a boolean).
    if "skip_check_uncompress" in protocol_options:
        self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
#
# File operations (match, list, download) and associated hook methods
#
def _append_file_to_download(self, rfile):
    """
    Add a file to the download list and check its properties (this method
    is called in `match` and `set_files_to_download`).
    Downloaders can override this to add some properties to the file (for
    instance, most of them will add "root").

    :param rfile: file description dict (at least 'name' is expected)
    :type rfile: dict
    """
    # Add properties to the file if needed (for safety)
    if 'save_as' not in rfile or rfile['save_as'] is None:
        # By default the file is saved under its remote name
        rfile['save_as'] = rfile['name']
    if self.param:
        # Attach downloader-level parameters unless the file already has some
        if 'param' not in rfile or not rfile['param']:
            rfile['param'] = self.param
    self.files_to_download.append(rfile)
def set_files_to_download(self, files):
    """
    Convenience method to set the list of files to download.

    Resets the current list and funnels every file through
    `_append_file_to_download` so per-downloader defaults are applied.

    :param files: file description dicts
    :type files: list
    """
    self.files_to_download = []
    for file_to_download in files:
        self._append_file_to_download(file_to_download)
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
Find files matching patterns. Sets instance variable files_to_download.
......@@ -130,13 +197,12 @@ class DownloadInterface(object):
if subdir == '^':
subdirs_pattern = subdirs_pattern[1:]
subdir = subdirs_pattern[0]
if not dir_list and pattern == '**/*':
# Take all and no more dirs, take all files
# If getting all, get all files
if pattern == '**/*':
for rfile in file_list:
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
return
for direlt in dir_list:
......@@ -147,10 +213,9 @@ class DownloadInterface(object):
self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
for rfile in file_list:
if pattern == '**/*' or re.match(pattern, rfile['name']):
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
else:
if re.match(subdirs_pattern[0], subdir):
......@@ -163,10 +228,9 @@ class DownloadInterface(object):
else:
for rfile in file_list:
if re.match(pattern, rfile['name']):
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
if not submatch and len(self.files_to_download) == 0:
raise Exception('no file found matching expressions')
......@@ -226,7 +290,6 @@ class DownloadInterface(object):
self.files_to_copy.append(dfile)
else:
new_files_to_download.append(dfile)
else:
# Copy everything
for dfile in self.files_to_download:
......@@ -236,17 +299,66 @@ class DownloadInterface(object):
else:
new_files_to_download.append(dfile)
self.files_to_download = new_files_to_download
self.set_files_to_download(new_files_to_download)
def download(self, local_dir):
def _download(self, file_path, rfile):
    '''
    Download one file and return False in case of success and True
    otherwise. This must be implemented in subclasses.

    :param file_path: local path where the file must be written
    :type file_path: str
    :param rfile: description of the remote file to fetch
    :type rfile: dict
    '''
    raise NotImplementedError()
def download(self, local_dir, keep_dirs=True):
    '''
    Download remote files to local_dir

    Template method: iterates over files_to_download, computes the local
    path from 'save_as', creates missing directories, delegates the
    actual transfer to the subclass hook `_download` and records the
    download time and permissions.

    :param local_dir: Directory where files should be downloaded
    :type local_dir: str
    :param keep_dirs: keep file name directory structure or copy file in local_dir directly
    :type keep_dirs: bool
    :return: list of downloaded files
    :raises Exception: on kill request or when `_download` reports an error
    '''
    self.logger.debug(self.__class__.__name__ + ':Download')
    nb_files = len(self.files_to_download)
    cur_files = 1
    self.offline_dir = local_dir
    for rfile in self.files_to_download:
        if self.kill_received:
            raise Exception('Kill request received, exiting')
        # Determine where to store file (directory and name)
        file_dir = local_dir
        if keep_dirs:
            file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
        if file_dir[-1] == "/":
            file_path = file_dir + os.path.basename(rfile['save_as'])
        else:
            file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
        # For unit tests only, workflow will take in charge directory
        # creation before to avoid thread multi access
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        msg = self.__class__.__name__ + ':Download:Progress:'
        msg += str(cur_files) + '/' + str(nb_files)
        msg += ' downloading file ' + rfile['name'] + ' save as ' + rfile['save_as']
        self.logger.debug(msg)
        cur_files += 1
        # Wall-clock timing (second resolution) of the transfer
        start_time = datetime.datetime.now()
        start_time = time.mktime(start_time.timetuple())
        error = self._download(file_path, rfile)
        if error:
            rfile['download_time'] = 0
            rfile['error'] = True
            raise Exception(self.__class__.__name__ + ":Download:Error:" + rfile["name"])
        else:
            end_time = datetime.datetime.now()
            end_time = time.mktime(end_time.timetuple())
            rfile['download_time'] = end_time - start_time
        # Set permissions
        self.set_permissions(file_path, rfile)
    return self.files_to_download
def list(self):
'''
......@@ -262,26 +374,6 @@ class DownloadInterface(object):
'''
pass
def set_credentials(self, userpwd):
'''
Set credentials in format user:pwd
:param userpwd: credentials
:type userpwd: str
'''
self.credentials = userpwd
def set_options(self, protocol_options):
"""
Set protocol specific options.
Subclasses that override this method must call the
parent implementation.
"""
self.protocol_options = protocol_options
if "skip_check_uncompress" in protocol_options:
self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
def close(self):
'''
Close connection
......
......@@ -15,7 +15,6 @@ class LocalDownload(DownloadInterface):
remote.dir=/blast/db/FASTA/
remote.files=^alu.*\\.gz$
'''
def __init__(self, rootdir, use_hardlinks=False):
......@@ -24,6 +23,11 @@ class LocalDownload(DownloadInterface):
self.rootdir = rootdir
self.use_hardlinks = use_hardlinks
def _append_file_to_download(self, rfile):
    # Default the file root to this downloader's rootdir so the copy step
    # knows where to read from; respect a root already set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(LocalDownload, self)._append_file_to_download(rfile)
def download(self, local_dir):
'''
Copy local files to local_dir
......
import logging
import os
from datetime import datetime
import time
from biomaj_core.utils import Utils
from biomaj_download.download.interface import DownloadInterface
from irods.session import iRODSSession
from irods.models import Collection, DataObject, User
from irods.models import DataObject, User
class IRODSDownload(DownloadInterface):
# To connect to irods session : sess = iRODSSession(host='localhost', port=1247, user='rods', password='rods', zone='tempZone')
# password : self.credentials
def __init__(self, protocol, server, remote_dir):
# This is used only for messages
real_protocol = "irods"
def __init__(self, server, remote_dir):
DownloadInterface.__init__(self)
self.port = None
self.remote_dir = remote_dir # directory on the remote server : zone
self.port = 1247
self.remote_dir = remote_dir # directory on the remote server including zone
self.rootdir = remote_dir
self.user = None
self.password = None
self.server = server
self.zone = None
self.zone = remote_dir.split("/")[0]
def _append_file_to_download(self, rfile):
    # Default the file root to the iRODS collection rootdir so _download
    # knows where to fetch from; respect a root already set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(IRODSDownload, self)._append_file_to_download(rfile)
def set_param(self, param):
# self.param is a dictionnary which has the following form :{'password': u'biomaj', 'protocol': u'iget', 'user': u'biomaj', 'port': u'port'}
# param is a dictionary which has the following form :
# {'password': u'biomaj', 'user': u'biomaj', 'port': u'port'}
# port is optional
self.param = param
self.port = int(param['port'])
self.user = str(param['user'])
self.password = str(param['password'])
self.zone = str(param['zone'])
if 'port' in param:
self.port = int(param['port'])
def list(self, directory=''):
session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
......@@ -36,10 +39,13 @@ class IRODSDownload(DownloadInterface):
rdirs = []
rfile = {}
date = None
for result in session.query(Collection.name, DataObject.name, DataObject.size, DataObject.owner_name, DataObject.modify_time).filter(User.name == self.user).get_results():
# if the user is biomaj : he will have access to all the irods data (biomaj ressource) : drwxr-xr-x
query = session.query(DataObject.name, DataObject.size,
DataObject.owner_name, DataObject.modify_time)
results = query.filter(User.name == self.user).get_results()
for result in results:
# Avoid duplication
if rfile != {} and rfile['name'] == str(result[DataObject.name]) and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
if rfile != {} and rfile['name'] == str(result[DataObject.name]) \
and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
continue
rfile = {}
date = str(result[DataObject.modify_time]).split(" ")[0].split('-')
......@@ -49,81 +55,28 @@ class IRODSDownload(DownloadInterface):
rfile['day'] = int(date[2])
rfile['year'] = int(date[0])
rfile['name'] = str(result[DataObject.name])
rfile['download_path'] = str(result[Collection.name])
rfiles.append(rfile)
session.cleanup()
return (rfiles, rdirs)
def download(self, local_dir, keep_dirs=True):
'''
Download remote files to local_dir
:param local_dir: Directory where files should be downloaded
:type local_dir: str
:param keep_dirs: keep file name directory structure or copy file in local_dir directly
:param keep_dirs: bool
:return: list of downloaded files
'''
logging.debug('IRODS:Download')
try:
os.chdir(local_dir)
except TypeError:
logging.error("IRODS:list:Could not find offline_dir")
nb_files = len(self.files_to_download)
cur_files = 1
# give a working directory to copy the file from irods
remote_dir = self.remote_dir
for rfile in self.files_to_download:
if self.kill_received:
raise Exception('Kill request received, exiting')
file_dir = local_dir
if 'save_as' not in rfile or rfile['save_as'] is None:
rfile['save_as'] = rfile['name']
if keep_dirs:
file_dir = local_dir + os.path.dirname(rfile['save_as'])
file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
# For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
if not os.path.exists(file_dir):
os.makedirs(file_dir)
logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
cur_files += 1
start_time = datetime.now()
start_time = time.mktime(start_time.timetuple())
self.remote_dir = rfile['root']
error = self.irods_download(file_dir, str(self.remote_dir), str(rfile['name']))
if error:
rfile['download_time'] = 0
rfile['error'] = True
raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
else:
archive_status = Utils.archive_check(file_path)
if not archive_status:
self.logger.error('Archive is invalid or corrupted, deleting file')
rfile['error'] = True
if os.path.exists(file_path):
os.remove(file_path)
raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
end_time = datetime.now()
end_time = time.mktime(end_time.timetuple())
rfile['download_time'] = end_time - start_time
self.set_permissions(file_path, rfile)
self.remote_dir = remote_dir
return(self.files_to_download)
def irods_download(self, file_dir, file_path, file_to_download):
def _download(self, file_dir, rfile):
error = False
logging.debug('IRODS:IRODS DOWNLOAD')
session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
self.logger.debug('IRODS:IRODS DOWNLOAD')
session = iRODSSession(host=self.server, port=self.port,
user=self.user, password=self.password,
zone=self.zone)
try:
file_to_get = str(file_path) + str(file_to_download)
# Write the file to download in the wanted file_dir : with the python-irods iget
# iRODS don't like multiple "/"
if rfile['root'][-1] == "/":
file_to_get = rfile['root'] + rfile['name']
else:
file_to_get = rfile['root'] + "/" + rfile['name']
# Write the file to download in the wanted file_dir with the
# python-irods iget
obj = session.data_objects.get(file_to_get, file_dir)
except ExceptionIRODS as e:
logging.error("RsyncError:" + str(e))
logging.error("RsyncError: irods object" + str(obj))
self.logger.error(self.__class__.__name__ + ":Download:Error:Can't get irods object " + str(obj))
self.logger.error(self.__class__.__name__ + ":Download:Error:" + str(e))
session.cleanup()
return(error)
......
# from future import standard_library
# standard_library.install_aliases()
# from builtins import str
import logging
import re
import os
import subprocess
from datetime import datetime
import time
from biomaj_download.download.interface import DownloadInterface
class RSYNCDownload(DownloadInterface):
'''
Base class to download files from rsyncc
Base class to download files from rsync
protocol = rsync
server =
remote.dir =
......@@ -21,18 +18,76 @@ class RSYNCDownload(DownloadInterface):
remote.files =
'''
def __init__(self, protocol, server, remote_dir):
# This is used to forge the command
real_protocol = "rsync"
def __init__(self, server, rootdir):
DownloadInterface.__init__(self)
logging.debug('Download')
self.rootdir = remote_dir
self.protocol = protocol
if server and remote_dir:
self.logger.debug('Download')
# If rootdir is not given, we are in local mode. In this case, server
# is interpreted as rootdir
self.local_mode = not rootdir
if not self.local_mode:
self.server = server # name of the remote server
self.remote_dir = remote_dir # directory on the remote server
self.rootdir = rootdir # directory on the remote server
else:
if server:
self.server = server
self.remote_dir = ""
self.server = None
self.rootdir = server
# give a working directory to run rsync
if self.local_mode:
try:
os.chdir(self.rootdir)
except TypeError:
self.logger.error("RSYNC:Could not find local dir " + self.rootdir)
def _append_file_to_download(self, rfile):
    # Default the file root to the rsync rootdir (remote directory, or
    # local directory in local mode); respect a root set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(RSYNCDownload, self)._append_file_to_download(rfile)
def _remote_file_name(self, rfile):
# rfile['root'] is set to self.rootdir. We don't use os.path.join
# because rfile['name'] may starts with /
url = rfile['root'] + "/" + rfile['name']
if not self.local_mode:
url = self.server + ":" + url
return url
def _download(self, file_path, rfile):
    """Fetch one remote file into *file_path* by shelling out to rsync.

    Returns True on failure, False on success (the convention used by the
    calling download loop).
    """
    error = False
    # Stays '' (hence != 0, hence error) if Popen raises before returncode
    # is read -- preserves the original failure semantics.
    err_code = ''
    url = self._remote_file_name(rfile)
    # Assemble the rsync command line, injecting credentials when present.
    source = url
    if self.credentials:
        source = str(self.credentials) + "@" + url
    cmd = str(self.real_protocol) + " " + source + " " + str(file_path)
    self.logger.debug('RSYNC:RSYNC DOwNLOAD:' + cmd)
    # Launch the command (we are in offline_dir)
    try:
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        _, stderr = proc.communicate()
        err_code = proc.returncode
        # Turn rsync stderr diagnostics into ExceptionRsync.
        self.test_stderr_rsync_message(stderr)
        self.test_stderr_rsync_error(stderr)
    except ExceptionRsync as e:
        self.logger.error(str(self.real_protocol) + " error:" + str(e))
    if err_code != 0:
        self.logger.error('Error while downloading ' + rfile["name"] + ' - ' + str(err_code))
        error = True
    return error
def test_stderr_rsync_error(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) reports an 'rsync error'."""
    text = stderr.decode('utf-8')
    if "rsync error" in text:
        # Extract the first line after the "rsync error:" marker.
        reason = text.split(str(self.real_protocol) + " error:")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def test_stderr_rsync_message(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) carries an 'rsync:' message."""
    text = stderr.decode('utf-8')
    if "rsync:" in text:
        # Extract the first line after the "rsync:" marker.
        reason = text.split(str(self.real_protocol) + ":")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def list(self, directory=''):
'''
......@@ -43,18 +98,14 @@ class RSYNCDownload(DownloadInterface):
err_code = None
rfiles = []
rdirs = []
logging.debug('RSYNC:List')
# give a working directory to run rsync
try:
os.chdir(self.offline_dir)
except TypeError:
logging.error("RSYNC:list:Could not find offline_dir")
if self.remote_dir and self.credentials:
cmd = str(self.protocol) + " --list-only " + str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(directory)
elif (self.remote_dir and not self.credentials):
cmd = str(self.protocol) + " --list-only " + str(self.server) + ":" + str(self.remote_dir) + str(directory)
else: # Local rsync for unitest
cmd = str(self.protocol) + " --list-only " + str(self.server) + str(directory)
self.logger.debug('RSYNC:List')
if self.local_mode:
remote = str(self.rootdir) + str(directory)
else:
remote = str(self.server) + ":" + str(self.rootdir) + str(directory)
if self.credentials:
remote = str(self.credentials) + "@" + remote
cmd = str(self.real_protocol) + " --list-only " + remote
try:
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
list_rsync, err = p.communicate()
......@@ -62,9 +113,9 @@ class RSYNCDownload(DownloadInterface):
self.test_stderr_rsync_error(err)
err_code = p.returncode
except ExceptionRsync as e:
logging.error("RsyncError:" + str(e))
self.logger.error("RsyncError:" + str(e))
if err_code != 0:
logging.error('Error while listing ' + str(err_code))
self.logger.error('Error while listing ' + str(err_code))
return(rfiles, rdirs)
list_rsync = str(list_rsync.decode('utf-8'))
lines = list_rsync.rstrip().split("\n")
......@@ -92,97 +143,6 @@ class RSYNCDownload(DownloadInterface):
return (rfiles, rdirs)
def download(self, local_dir, keep_dirs=True):
    '''
    Download remote files to local_dir

    :param local_dir: Directory where files should be downloaded
    :type local_dir: str
    :param keep_dirs: keep file name directory structure or copy file in local_dir directly
    :param keep_dirs: bool
    :return: list of downloaded files
    :raises Exception: on kill request or when a file fails to download
    '''
    logging.debug('RSYNC:Download')
    nb_files = len(self.files_to_download)
    cur_files = 1
    # give a working directory to run rsync
    try:
        os.chdir(self.offline_dir)
    except TypeError:
        logging.error("RSYNC:list:Could not find offline_dir")
    for rfile in self.files_to_download:
        if self.kill_received:
            raise Exception('Kill request received, exiting')
        file_dir = local_dir
        if 'save_as' not in rfile or rfile['save_as'] is None:
            rfile['save_as'] = rfile['name']
        if keep_dirs:
            file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
        # BUGFIX: the previous regex-based concatenation had its branches
        # inverted -- it omitted the '/' separator when file_dir had no
        # trailing slash (and doubled it when it had one). os.path.join
        # inserts the separator exactly when needed.
        file_path = os.path.join(file_dir, os.path.basename(rfile['save_as']))
        # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
        logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
        cur_files += 1
        # Timestamps are truncated to whole seconds via mktime, as before.
        start_time = datetime.now()
        start_time = time.mktime(start_time.timetuple())
        error = self.rsync_download(file_path, rfile['name'])
        if error:
            rfile['download_time'] = 0
            rfile['error'] = True
            raise Exception("RSYNC:Download:Error:" + rfile['root'] + '/' + rfile['name'])
        end_time = datetime.now()
        end_time = time.mktime(end_time.timetuple())
        rfile['download_time'] = end_time - start_time
        self.set_permissions(file_path, rfile)
    return self.files_to_download
def rsync_download(self, file_path, file_to_download):
    """Shell out to rsync to copy *file_to_download* to *file_path*.

    Returns True on failure, False on success.
    """
    error = False
    # Stays '' (hence != 0, hence error) if an exception fires before the
    # return code is read.
    err_code = ''
    logging.debug('RSYNC:RSYNC DOwNLOAD')
    # give a working directory to run rsync
    try:
        os.chdir(self.offline_dir)
    except TypeError:
        logging.error("RSYNC:list:Could not find offline_dir")
    try:
        # Build the source spec: remote (with or without credentials) or
        # plain local path (used by the unit tests).
        if self.remote_dir and self.credentials:
            remote = str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(file_to_download)
        elif self.remote_dir:
            remote = str(self.server) + ":" + str(self.remote_dir) + str(file_to_download)
        else:
            remote = str(self.server) + str(file_to_download)
        cmd = str(self.protocol) + " " + remote + " " + str(file_path)
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        _, stderr = proc.communicate()
        err_code = proc.returncode
        self.test_stderr_rsync_message(stderr)
        self.test_stderr_rsync_error(stderr)
    except ExceptionRsync as e:
        logging.error("RsyncError:" + str(e))
    if err_code != 0:
        logging.error('Error while downloading ' + file_to_download + ' - ' + str(err_code))
        error = True
    return error
def test_stderr_rsync_error(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) reports an 'rsync error'."""
    text = stderr.decode('utf-8')
    if "rsync error" in text:
        # Extract the first line after the "rsync error:" marker.
        reason = text.split(str(self.protocol) + " error:")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def test_stderr_rsync_message(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) carries an 'rsync:' message."""
    text = stderr.decode('utf-8')
    if "rsync:" in text:
        # Extract the first line after the "rsync:" marker.
        reason = text.split(str(self.protocol) + ":")[1].split("\n")[0]
        raise ExceptionRsync(reason)
class ExceptionRsync(Exception):
def __init__(self, exception_reason):
......
......@@ -13,10 +13,9 @@ import pika
from flask import Flask
from flask import jsonify
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.http import HTTPDownload
from biomaj_download.download.curl import CurlDownload
from biomaj_download.download.direct import DirectFTPDownload
from biomaj_download.download.direct import DirectHttpDownload
from biomaj_download.download.direct import DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
from biomaj_download.message import downmessage_pb2
from biomaj_download.download.rsync import RSYNCDownload
......@@ -134,24 +133,24 @@ class DownloadService(object):
protocol_options={}):
protocol = downmessage_pb2.DownloadFile.Protocol.Value(protocol_name.upper())
downloader = None
if protocol in [0, 1]:
downloader = FTPDownload(protocol_name, server, remote_dir)
if protocol in [2, 3]:
downloader = HTTPDownload(protocol_name, server, remote_dir, http_parse)
if protocol == 7:
downloader = LocalDownload(remote_dir)
if protocol == 4:
downloader = DirectFTPDownload('ftp', server, '/')
if protocol == 10:
if protocol in [0, 1]: # FTP, SFTP
downloader = CurlDownload(protocol_name, server, remote_dir)
if protocol in [2, 3]: # HTTP, HTTPS (could be factored with previous case)
downloader = CurlDownload(protocol_name, server, remote_dir, http_parse)
if protocol == 4: # DirectFTP
downloader = DirectFTPDownload("ftp", server, '/')
if protocol == 5: # DirectHTTP
downloader = DirectHTTPDownload("http", server, '/')
if protocol == 6: # DirectHTTPS
downloader = DirectHTTPDownload("https", server, '/')
if protocol == 10: # DirectFTPS
downloader = DirectFTPDownload('ftps', server, '/')
if protocol == 5:
downloader = DirectHttpDownload('http', server, '/')
if protocol == 6:
downloader = DirectHttpDownload('https', server, '/')
if protocol == 8:
downloader = RSYNCDownload('rsync', server, remote_dir)
if protocol == 9:
downloader = IRODSDownload('irods', server, remote_dir)
if protocol == 7: # Local
downloader = LocalDownload(remote_dir)
if protocol == 8: # RSYNC
downloader = RSYNCDownload(server, remote_dir)
if protocol == 9: # iRods
downloader = IRODSDownload(server, remote_dir)
if downloader is None:
return None
......@@ -182,11 +181,13 @@ class DownloadService(object):
if save_as:
downloader.set_save_as(save_as)
if param:
downloader.set_param(param)
downloader.set_server(server)
# Set the name of the BioMAJ protocol to which we respond.
downloader.set_protocol(protocol_name)
if protocol_options is not None:
......
biomaj3-download (3.0.27-1) UNRELEASED; urgency=medium
[ PENDING ]
Needs python3-ftputil, in NEW queue
biomaj3-download (3.1.0-1) unstable; urgency=medium
[ Olivier Sallou ]
* New upstream release
-- Olivier Sallou <osallou@debian.org> Wed, 16 Oct 2019 13:17:33 +0000
-- Olivier Sallou <osallou@debian.org> Tue, 12 Nov 2019 10:18:15 +0000
biomaj3-download (3.0.21-1) unstable; urgency=medium
......
......@@ -12,6 +12,7 @@ Build-Depends: debhelper (>= 12~),
python3-consul,
python3-flask,
python3-humanfriendly,
python3-irodsclient,
python3-mock,
python3-nose,
python3-pika,
......@@ -25,7 +26,8 @@ Build-Depends: debhelper (>= 12~),
python3-yaml,
python3-biomaj3-core (>= 3.0.19),
python3-biomaj3-zipkin,
python3-ftputil
python3-ftputil,
rsync
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/biomaj3-download
Vcs-Git: https://salsa.debian.org/med-team/biomaj3-download.git
......
......@@ -22,7 +22,7 @@ config = {
'url': 'http://biomaj.genouest.org',
'download_url': 'http://biomaj.genouest.org',
'author_email': 'olivier.sallou@irisa.fr',
'version': '3.0.27',
'version': '3.1.0',
'classifiers': [
# How mature is this project? Common values are
# 3 - Alpha
......
from nose.tools import *
"""
Note that attributes 'network' and 'local_irods' are ignored for CI.
"""
from nose.plugins.attrib import attr
import json
import shutil
import os
import sys
import tempfile
import logging
import copy
import stat
import time
from mock import patch
from optparse import OptionParser
from biomaj_core.config import BiomajConfig
from biomaj_core.utils import Utils
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.direct import DirectFTPDownload, DirectHttpDownload
from biomaj_download.download.http import HTTPDownload, HTTPParse
from biomaj_download.download.curl import CurlDownload, HTTPParse
from biomaj_download.download.direct import DirectFTPDownload, DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
from biomaj_download.download.downloadthreads import DownloadThread
from biomaj_download.download.rsync import RSYNCDownload
from biomaj_download.download.protocolirods import IRODSDownload
import pprint
import unittest
class UtilsForTest():
......@@ -263,7 +255,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.utils.clean()
def test_http_list(self):
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
......@@ -271,7 +263,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_list_dateregexp(self):
#self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
......@@ -287,7 +279,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
-1
)
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -304,7 +296,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.config.get('http.group.file.date_format', None),
int(self.config.get('http.group.file.size'))
)
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -313,7 +305,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
print(str(file_list))
httpd.match([r'^README$'], file_list, dir_list)
......@@ -323,7 +315,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download_in_subdir(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^dists/README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -331,6 +323,65 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.assertTrue(len(httpd.files_to_download) == 1)
@attr('network')
@attr('https')
class TestBiomajHTTPSDownload(unittest.TestCase):
"""
Test HTTPS downloader
"""
def setUp(self):
self.utils = UtilsForTest()
def tearDown(self):
self.utils.clean()
def test_download(self):
# NOTE(review): self.utils is re-created here although setUp already did
# so -- harmless but redundant; confirm intent.
self.utils = UtilsForTest()
# NOTE(review): the two regex arguments below are near-duplicates that
# look like an interleaved old/new pair from a diff -- confirm which one
# HTTPParse should actually receive (capture-group placement differs).
self.http_parse = HTTPParse(
"<a[\s]+href=\"([\w\-\.]+\">[\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
"<a[\s]+href=\"[\w\-\.]+\">([\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
# NOTE(review): the repeated '1, 2' pairs below may likewise be a stale
# old/new interleave of HTTPParse's positional arguments -- verify
# against the HTTPParse signature.
1,
2,
1,
2,
None,
3
)
self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
httpd = CurlDownload('https', 'mirrors.edge.kernel.org', '/pub/software/scm/git/debian/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^git-core-0.99.6.tar.gz$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
httpd.close()
self.assertTrue(len(httpd.files_to_download) == 1)
@attr('network')
@attr('sftp')
class TestBiomajSFTPDownload(unittest.TestCase):
    """Network test for the SFTP flavour of CurlDownload.

    NOTE(review): PROTOCOL is "ftps" although the class is named SFTP --
    confirm against CurlDownload's supported protocol names.
    """

    PROTOCOL = "ftps"

    def setUp(self):
        self.utils = UtilsForTest()

    def tearDown(self):
        self.utils.clean()

    def test_download(self):
        # Download a single known file from the public rebex test server.
        downloader = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
        downloader.set_credentials("demo:password")
        file_list, dir_list = downloader.list()
        downloader.match([r'^readme.txt$'], file_list, dir_list)
        downloader.download(self.utils.data_dir)
        downloader.close()
        self.assertTrue(len(downloader.files_to_download) == 1)
@attr('directftp')
@attr('network')
class TestBiomajDirectFTPDownload(unittest.TestCase):
......@@ -411,7 +462,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_http_list(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
fday = ftpd.files_to_download[0]['day']
fmonth = ftpd.files_to_download[0]['month']
......@@ -424,7 +475,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
(file_list, dir_list) = ftpd.list()
ftpd.download(self.utils.data_dir, False)
......@@ -433,7 +484,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_get_params_save_as(self):
file_list = ['/get']
ftpd = DirectHttpDownload('http', 'httpbin.org', '')
ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
......@@ -449,7 +500,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
@attr('test')
def test_download_save_as(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
ftpd.save_as = 'test.html'
(file_list, dir_list) = ftpd.list()
......@@ -460,7 +511,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_post_params(self):
#file_list = ['/debian/README.html']
file_list = ['/post']
ftpd = DirectHttpDownload('http', 'httpbin.org', '')
ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
......@@ -489,19 +540,19 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.utils.clean()
def test_ftp_list(self):
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) > 1)
@attr('test')
def test_download(self):
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
# This tests fails because the zip file is fake. We intercept the failure
# and continue.
# See test_download_skip_uncompress_checks
# See test_download_skip_check_uncompress
try:
ftpd.download(self.utils.data_dir)
except Exception:
......@@ -511,9 +562,9 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
ftpd.close()
def test_download_skip_checks_uncompress(self):
def test_download_skip_check_uncompress(self):
# This test is similar to test_download but we skip test of zip file.
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
ftpd.set_options(dict(skip_check_uncompress=True))
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
......@@ -522,7 +573,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
def test_download_in_subdir(self):
ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
(file_list, dir_list) = ftpd.list()
try:
ftpd.match([r'^doc/mailing-lists.txt$'], file_list, dir_list)
......@@ -534,7 +585,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 1)
def test_download_or_copy(self):
ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd.files_to_download = [
{'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test2', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
......@@ -566,7 +617,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(release['day']=='12')
def test_ms_server(self):
ftpd = FTPDownload("ftp", "test.rebex.net", "/")
ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match(["^readme.txt$"], file_list, dir_list)
......@@ -579,7 +630,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
Test setting tcp_keepalive (it probably doesn't change anything here but
we test that there is no obvious mistake in the code).
"""
ftpd = FTPDownload("ftp", "test.rebex.net", "/")
ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_options(dict(tcp_keepalive=10))
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
......@@ -604,14 +655,14 @@ class TestBiomajFTPSDownload(unittest.TestCase):
self.utils.clean()
def test_ftps_list(self):
ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) == 1)
def test_download(self):
ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^readme.txt$'], file_list, dir_list)
......@@ -624,7 +675,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
......@@ -636,7 +687,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
......@@ -651,7 +702,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
curdir = os.path.dirname(os.path.realpath(__file__))
cert_file = os.path.join(curdir, "caert.demo.wftpserver.com.pem")
ftpd.set_options(dict(ssl_verifyhost="False", ssl_server_cert=cert_file))
......@@ -672,7 +723,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
def setUp(self):
self.utils = UtilsForTest()
self.curdir = os.path.dirname(os.path.realpath(__file__))
self.curdir = os.path.dirname(os.path.realpath(__file__)) + '/'
self.examples = os.path.join(self.curdir,'bank') + '/'
BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
......@@ -680,40 +731,34 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.utils.clean()
def test_rsync_list(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
self.assertTrue(len(files_list) != 0)
def test_rsync_match(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], files_list, dir_list, prefix='', submatch=False)
self.assertTrue(len(rsyncd.files_to_download) != 0)
def test_rsync_download(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
error = rsyncd.rsync_download(self.utils.data_dir, "test2.fasta")
self.assertTrue(error == 0)
rsyncd = RSYNCDownload(self.examples, "")
rfile = {
"name": "test2.fasta",
"root": self.examples
}
error = rsyncd._download(self.utils.data_dir, rfile)
self.assertFalse(error)
def test_rsync_general_download(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'],files_list,dir_list, prefix='')
download_files=rsyncd.download(self.curdir)
self.assertTrue(len(download_files)==1)
def test_rsync_download_or_copy(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
files_to_download_prev = rsyncd.files_to_download
......@@ -721,8 +766,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.assertTrue(files_to_download_prev != rsyncd.files_to_download)
def test_rsync_download_in_subdir(self):
rsyncd = RSYNCDownload('rsync', self.curdir+'/', "")
rsyncd.set_offline_dir(self.curdir+'/')
rsyncd = RSYNCDownload(self.curdir, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^/bank/test*'], file_list, dir_list, prefix='')
rsyncd.download(self.utils.data_dir)
......@@ -824,8 +868,22 @@ class TestBiomajIRODSDownload(unittest.TestCase):
initialize_mock.return_value=mock_session.configure()
query_mock.return_value = mock_session.query(None,None,None,None,None)
cleanup_mock.return_value = mock_session.cleanup()
irodsd = IRODSDownload('irods', self.examples, "")
irodsd.set_credentials(None)
irodsd.set_offline_dir(self.utils.data_dir)
irodsd = IRODSDownload(self.examples, "")
(files_list, dir_list) = irodsd.list()
self.assertTrue(len(files_list) != 0)
@attr('local_irods')
def test_irods_download(self):
    # Requires a local iRODS server (default port, user 'rods', password
    # 'rods') whose zone /tempZone/home/rods holds a file matching
    # r'^test.*\.gz$' (e.g. a copy of tests/bank/test/test.fasta.gz).
    downloader = IRODSDownload("localhost", "/tempZone/home/rods")
    downloader.set_param({
        'user': 'rods',
        'password': 'rods',
    })
    file_list, dir_list = downloader.list()
    downloader.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
    downloader.download(self.utils.data_dir)
    self.assertTrue(len(downloader.files_to_download) == 1)