Skip to content
Commits on Source (5)
......@@ -20,7 +20,7 @@ install:
- pip install python-coveralls
- python setup.py -q install
script:
- nosetests -a '!network'
- nosetests -a '!network,!local_irods'
- flake8 --ignore E501 biomaj_download/*.py biomaj_download/download
deploy:
provider: pypi
......
3.1.0:
#16 Don't change name after download in DirectHTTPDownloader
PR #7 Refactor downloaders (*WARNING* breaks API)
3.0.27:
Fix previous release broken with a bug in direct protocols
3.0.26:
......
......@@ -17,6 +17,19 @@ To compile protobuf, in biomaj_download/message:
flake8 biomaj_download/\*.py biomaj_download/download
# Test
To run the test suite, use:
nosetests -a '!local_irods' tests/biomaj_tests.py
This command skips the tests that need a local iRODS server.
Some test might fail due to network connection. You can skip them with:
nosetests -a '!network' tests/biomaj_tests.py
(To skip the local iRODS test and the network tests, use `-a '!network,!local_irods'`).
# Run
......
"""
Subclasses for direct download (i.e. downloading without regexp). The usage is
a bit different: instead of calling method:`list` and method:`match`, client
code explicitly calls method:`set_files_to_download` (passing a list
containing only the file name). method:`list` is used to get more information
about the file (if possible). method:`match` matches everything.
Also client code can use method:`set_save_as` to indicate the name of the file
to save.
The trick for the implementation is to override
method:`_append_file_to_download` to initialize the rfile with the file name
and dummy values. Note that we use a list of rfile even if it contains only one
file.
method:`list` modifies files_to_download directly.
method:`match` doesn't call method:`_append_file_to_download` (since the list of
files to download is already set up).
We also override method:`set_files_to_download` to check that we pass only one
file.
"""
import datetime
import time
import pycurl
import os
import re
import hashlib
import sys
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.curl import CurlDownload
from biomaj_core.utils import Utils
if sys.version_info[0] < 3:
......@@ -20,28 +37,21 @@ except ImportError:
from StringIO import StringIO as BytesIO
class DirectFTPDownload(FTPDownload):
class DirectFTPDownload(CurlDownload):
'''
download a list of files from FTP, no regexp
'''
def __init__(self, protocol, host, rootdir=''):
'''
ALL_PROTOCOLS = ["ftp", "ftps"]
def _append_file_to_download(self, filename):
'''
Initialize the files in list with today as last-modification date.
Size is also preset to zero, size will be set after download
Size is also preset to zero.
'''
FTPDownload.__init__(self, protocol, host, rootdir)
self.save_as = None
self.headers = {}
def set_files_to_download(self, files):
today = datetime.date.today()
self.files_to_download = []
for file_to_download in files:
rfile = {}
rfile['root'] = ''
rfile['root'] = self.rootdir
rfile['permissions'] = ''
rfile['group'] = ''
rfile['user'] = ''
......@@ -49,188 +59,76 @@ class DirectFTPDownload(FTPDownload):
rfile['month'] = today.month
rfile['day'] = today.day
rfile['year'] = today.year
if file_to_download.endswith('/'):
rfile['name'] = file_to_download[:-1]
if filename.endswith('/'):
rfile['name'] = filename[:-1]
else:
rfile['name'] = file_to_download
rfile['name'] = filename
rfile['hash'] = None
if self.param:
if 'param' not in file_to_download or not file_to_download['param']:
rfile['param'] = self.param
self.files_to_download.append(rfile)
# Use self.save_as even if we use it in list(). This is important.
rfile['save_as'] = self.save_as
super(DirectFTPDownload, self)._append_file_to_download(rfile)
def set_files_to_download(self, files_to_download):
    """
    Set the (single) file to download.

    Direct downloaders handle exactly one file per run, so a longer list
    is rejected instead of being silently truncated.

    :param files_to_download: list containing exactly one file entry
    :type files_to_download: list
    :raises ValueError: if more than one file is passed
    """
    if len(files_to_download) > 1:
        # Reset the list so a failed call leaves no stale state behind
        self.files_to_download = []
        msg = self.__class__.__name__ + ' accepts only 1 file'
        self.logger.error(msg)
        raise ValueError(msg)
    return super(DirectFTPDownload, self).set_files_to_download(files_to_download)
def list(self, directory=''):
    '''
    FTP protocol does not give us the possibility to get file date from remote url

    Only fills in 'save_as' for the files registered via
    method:`set_files_to_download` (defaulting to the remote file name)
    and returns them unchanged.

    :param directory: unused, kept for interface compatibility
    :return: tuple (files, directories); directories is always empty here
    '''
    for rfile in self.files_to_download:
        # Default save_as to the remote file name when the client did not
        # set one explicitly via set_save_as.
        if self.save_as is None:
            self.save_as = rfile['name']
        rfile['save_as'] = self.save_as
    # TODO: are we sure about this implementation ?
    return (self.files_to_download, [])
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
All files to download match, no pattern
'''
if dir_list is None:
dir_list = []
self.files_to_download = file_list
pass
class DirectHttpDownload(DirectFTPDownload):
class DirectHTTPDownload(DirectFTPDownload):
def __init__(self, protocol, host, rootdir=''):
'''
:param file_list: list of files to download on server
:type file_list: list
'''
DirectFTPDownload.__init__(self, protocol, host, rootdir)
self.save_as = None
ALL_PROTOCOLS = ["http", "https"]
def __init__(self, curl_protocol, host, rootdir=''):
DirectFTPDownload.__init__(self, curl_protocol, host, rootdir)
self.method = 'GET'
self.param = {}
def download(self, local_dir, keep_dirs=True):
'''
Download remote files to local_dir
:param local_dir: Directory where files should be downloaded
:type local_dir: str
:param keep_dirs: keep file name directory structure or copy file in local_dir directly
:param keep_dirs: bool
:return: list of downloaded files
'''
self.logger.debug('DirectHTTP:Download')
nb_files = len(self.files_to_download)
if nb_files > 1:
self.files_to_download = []
self.logger.error('DirectHTTP accepts only 1 file')
cur_files = 1
for rfile in self.files_to_download:
if self.kill_received:
raise Exception('Kill request received, exiting')
if not self.save_as:
self.save_as = rfile['name']
else:
rfile['save_as'] = self.save_as
file_dir = local_dir
if keep_dirs:
file_dir = local_dir + os.path.dirname(self.save_as)
file_path = file_dir + '/' + os.path.basename(self.save_as)
# For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
if not os.path.exists(file_dir):
os.makedirs(file_dir)
self.logger.debug('DirectHTTP:Download:Progress' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'] + ', save as ' + self.save_as)
cur_files += 1
if 'url' not in rfile:
rfile['url'] = self.url
fp = open(file_path, "wb")
curl = pycurl.Curl()
if self.proxy is not None:
curl.setopt(pycurl.PROXY, self.proxy)
if self.proxy_auth is not None:
curl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
if self.method == 'POST':
# Form data must be provided already urlencoded.
postfields = urlencode(self.param)
# Sets request method to POST,
# Content-Type header to application/x-www-form-urlencoded
# and data to send in request body.
if self.credentials is not None:
curl.setopt(pycurl.USERPWD, self.credentials)
curl.setopt(pycurl.POSTFIELDS, postfields)
try:
curl.setopt(pycurl.URL, rfile['url'] + rfile['root'] + '/' + rfile['name'])
except Exception:
curl.setopt(pycurl.URL, (rfile['url'] + rfile['root'] + '/' + rfile['name']).encode('ascii', 'ignore'))
else:
url = rfile['url'] + rfile['root'] + '/' + rfile['name'] + '?' + urlencode(self.param)
try:
curl.setopt(pycurl.URL, url)
except Exception:
curl.setopt(pycurl.URL, url.encode('ascii', 'ignore'))
curl.setopt(pycurl.WRITEDATA, fp)
start_time = datetime.datetime.now()
start_time = time.mktime(start_time.timetuple())
curl.perform()
end_time = datetime.datetime.now()
end_time = time.mktime(end_time.timetuple())
rfile['download_time'] = end_time - start_time
curl.close()
fp.close()
self.logger.debug('downloaded!')
rfile['name'] = self.save_as
self.set_permissions(file_path, rfile)
return self.files_to_download
def header_function(self, header_line):
    """
    pycurl HEADERFUNCTION callback: accumulate response headers into
    ``self.headers`` with lowercased names and whitespace-stripped values.

    The HTTP status line (``HTTP/1.x ...``) and any line without a colon
    (including continuations of headers folded over several lines) are
    ignored.
    """
    # The HTTP standard specifies headers are iso-8859-1 encoded; pycurl
    # hands us raw bytes, so decode before parsing. (On Python 2 the
    # decode would be optional, but it is harmless.)
    decoded = header_line.decode('iso-8859-1')
    field, sep, raw_value = decoded.partition(':')
    if not sep:
        # No colon: status line or malformed header — nothing to record.
        return
    # Header names are case-insensitive, so normalize to lowercase; the
    # value keeps its case, minus surrounding whitespace and the trailing
    # newline that pycurl includes.
    self.headers[field.strip().lower()] = raw_value.strip()
def _file_url(self, file_to_download):
    """
    Build the full URL of a remote file.

    For GET requests the query parameters (``self.param``) are appended
    as a urlencoded query string; for other methods (e.g. POST, where the
    parameters go in the request body) the parent URL is returned as-is.
    """
    url = super(DirectHTTPDownload, self)._file_url(file_to_download)
    if self.method == "GET":
        # NOTE(review): a bare '?' is appended even when self.param is
        # empty — presumably harmless for the servers targeted; confirm.
        url += '?' + urlencode(self.param)
    return url
def list(self, directory=''):
'''
Try to get file headers to get last_modification and size
'''
self._basic_curl_configuration()
# Specific configuration
self.crl.setopt(pycurl.HEADER, True)
self.crl.setopt(pycurl.NOBODY, True)
for rfile in self.files_to_download:
if self.save_as is None:
self.save_as = rfile['name']
rfile['save_as'] = self.save_as
self.crl.setopt(pycurl.HEADER, True)
if self.credentials is not None:
self.crl.setopt(pycurl.USERPWD, self.credentials)
if self.proxy is not None:
self.crl.setopt(pycurl.PROXY, self.proxy)
if self.proxy_auth is not None:
self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
self.crl.setopt(pycurl.NOBODY, True)
file_url = self._file_url(rfile)
try:
self.crl.setopt(pycurl.URL, self.url + self.rootdir + rfile['name'])
self.crl.setopt(pycurl.URL, file_url)
except Exception:
self.crl.setopt(pycurl.URL, (self.url + self.rootdir + rfile['name']).encode('ascii', 'ignore'))
self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
# Create a buffer and assign it to the pycurl object
output = BytesIO()
# lets assign this buffer to pycurl object
self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
self.crl.perform()
# Figure out what encoding was sent with the response, if any.
......
import pycurl
import re
import hashlib
import datetime
import humanfriendly
from biomaj_core.utils import Utils
from biomaj_download.download.ftp import FTPDownload
try:
from io import BytesIO
except ImportError:
from StringIO import StringIO as BytesIO
class HTTPParse(object):
    """
    Container for the regular expressions and capture-group indexes used
    to extract directory and file entries from an HTTP listing page.

    The values come from the bank configuration keys
    ``http.parse.dir.line`` / ``http.parse.file.line`` (regexps matching
    one directory / one file line) and ``http.group.*`` (1-based indexes
    of the capture groups holding the name, date and size).
    """

    def __init__(self, dir_line, file_line, dir_name=1, dir_date=2,
                 file_name=1, file_date=2, file_date_format=None, file_size=3):
        r'''
        Example configuration values:

        http.parse.dir.line: <img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
        http.parse.file.line: <img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
        http.group.dir.name: 1
        http.group.dir.date: 2
        http.group.file.name: 1
        http.group.file.date: 2
        http.group.file.size: 3
        '''
        # Regexps matching one directory / one file line of the page.
        self.dir_line = dir_line
        self.file_line = file_line
        # 1-based capture-group indexes inside those regexps.
        self.dir_name = dir_name
        self.dir_date = dir_date
        self.file_name = file_name
        self.file_date = file_date
        # Optional strptime format for the file date; -1 group indexes
        # mean "not available in the page".
        self.file_date_format = file_date_format
        self.file_size = file_size
class HTTPDownload(FTPDownload):
    '''
    Base class to download files from HTTP
    Makes use of http.parse.dir.line etc.. regexps to extract page information

    protocol=http
    server=ftp.ncbi.nih.gov
    remote.dir=/blast/db/FASTA/
    remote.files=^alu.*\\.gz$
    '''

    def __init__(self, protocol, host, rootdir, http_parse=None):
        # http_parse: an HTTPParse instance holding the regexps and group
        # indexes used by list() to parse the remote listing page.
        FTPDownload.__init__(self, protocol, host, rootdir)
        self.http_parse = http_parse

    def list(self, directory=''):
        '''
        List FTP directory

        Fetches the HTML listing page with curl, then applies the
        http_parse regexps to extract directory and file entries.

        :return: tuple of file and dirs in current directory with details
        '''
        self.logger.debug('Download:List:' + self.url + self.rootdir + directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            # Fall back to an ascii-only URL if pycurl rejects the string
            self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'
        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        r'''
        http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
        http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
        http.group.dir.name': 1,
        http.group.dir.date': 2,
        http.group.file.name': 1,
        http.group.file.date': 2,
        http.group.file.size': 3,
        '''
        rfiles = []
        rdirs = []
        # Directories: every match of the dir regexp becomes an rfile dict
        # with dummy permissions/owner and the parsed date.
        dirs = re.findall(self.http_parse.dir_line, result)
        if dirs is not None and len(dirs) > 0:
            for founddir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = 0
                # Group indexes in http_parse are 1-based, tuples are 0-based
                date = founddir[self.http_parse.dir_date - 1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = int(parts[0])
                rfile['year'] = int(parts[2])
                rfile['name'] = founddir[self.http_parse.dir_name - 1]
                rdirs.append(rfile)
        # Files: same idea, plus optional size and configurable date format
        files = re.findall(self.http_parse.file_line, result)
        if files is not None and len(files) > 0:
            for foundfile in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                if self.http_parse.file_size != -1:
                    # Size is human-readable in the page (e.g. "1.2M")
                    rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
                else:
                    rfile['size'] = 0
                if self.http_parse.file_date != -1:
                    date = foundfile[self.http_parse.file_date - 1]
                    if self.http_parse.file_date_format:
                        # Config escapes '%' as '%%'; unescape before strptime
                        date_object = datetime.datetime.strptime(date, self.http_parse.file_date_format.replace('%%', '%'))
                        rfile['month'] = date_object.month
                        rfile['day'] = date_object.day
                        rfile['year'] = date_object.year
                    else:
                        dirdate = date.split()
                        parts = dirdate[0].split('-')
                        # 19-Jul-2014 13:02
                        rfile['month'] = Utils.month_to_num(parts[1])
                        rfile['day'] = int(parts[0])
                        rfile['year'] = int(parts[2])
                else:
                    # No date in the page: stamp with today's date
                    today = datetime.datetime.now()
                    date = '%s-%s-%s' % (today.year, today.month, today.day)
                    rfile['month'] = today.month
                    rfile['day'] = today.day
                    rfile['year'] = today.year
                rfile['name'] = foundfile[self.http_parse.file_name - 1]
                # Hash over name+date+size: used to detect remote changes
                filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
                rfile['hash'] = hashlib.md5(filehash).hexdigest()
                rfiles.append(rfile)
        return (rfiles, rdirs)
......@@ -24,12 +24,30 @@ class _FakeLock(object):
class DownloadInterface(object):
'''
Main interface that all downloaders must extend
Main interface that all downloaders must extend.
The methods are divided into 2 broad categories:
- setters which act on properties of the downloader; those methods are
important in microservice mode
- file operations which are used to list and match remote files, download
them, etc.
Usually, it is enough to overload list, _append_file_to_download and
_download.
TODO:
- the purpose of some setters (set_server, set_protocol) is not clear
since a subclass cannot always change those parameters arbitrarily
- chroot is not used in BioMaJ
'''
files_num_threads = 4
def __init__(self):
# This variable defines the protocol as passed by the config file (i.e.
# this is directftp for DirectFTPDownload). It is used by the workflow
# to send the download message so it must be set.
self.protocol = None
self.config = None
self.files_to_download = []
self.files_to_copy = []
......@@ -47,13 +65,16 @@ class DownloadInterface(object):
self.logger = logging.getLogger('biomaj')
self.param = None
self.method = None
self.protocol = None
self.server = None
self.offline_dir = None
# Options
self.protocol_options = {}
self.skip_check_uncompress = False
#
# Setters for downloader
#
def set_offline_dir(self, offline_dir):
self.offline_dir = offline_dir
......@@ -61,15 +82,13 @@ class DownloadInterface(object):
self.server = server
def set_protocol(self, protocol):
"""
Method used by DownloadService to set the protocol. This value is
passed from the config file so is not always a real protocol (for
instance it can be "directhttp" for a direct downloader).
"""
self.protocol = protocol
def set_files_to_download(self, files):
self.files_to_download = files
for file_to_download in self.files_to_download:
if self.param:
if 'param' not in file_to_download or not file_to_download['param']:
file_to_download['param'] = self.param
def set_param(self, param):
self.param = param
......@@ -100,6 +119,54 @@ class DownloadInterface(object):
def set_method(self, method):
self.method = method
def set_credentials(self, userpwd):
    '''
    Set credentials in format user:pwd

    Stored as-is in self.credentials; downloaders pass it to their
    transport (e.g. curl USERPWD).

    :param userpwd: credentials
    :type userpwd: str
    '''
    self.credentials = userpwd
def set_options(self, protocol_options):
    """
    Set protocol specific options.

    Subclasses that override this method must call the
    parent implementation.

    :param protocol_options: option name -> value mapping
    :type protocol_options: dict
    """
    self.protocol_options = protocol_options
    # Generic option understood by every downloader: skip the archive
    # integrity check after download (value is parsed as a boolean).
    if "skip_check_uncompress" in protocol_options:
        self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
#
# File operations (match, list, download) and associated hook methods
#
def _append_file_to_download(self, rfile):
    """
    Add a file to the download list and check its properties (this method
    is called in `match` and `set_files_to_download`).
    Downloaders can override this to add some properties to the file (for
    instance, most of them will add "root").

    :param rfile: file description dict (at least 'name' is expected)
    :type rfile: dict
    """
    # Add properties to the file if needed (for safety)
    if 'save_as' not in rfile or rfile['save_as'] is None:
        # By default the file is saved under its remote name
        rfile['save_as'] = rfile['name']
    if self.param:
        # Attach downloader-level parameters unless the file already has some
        if 'param' not in rfile or not rfile['param']:
            rfile['param'] = self.param
    self.files_to_download.append(rfile)
def set_files_to_download(self, files):
    """
    Convenience method to set the list of files to download.

    Resets the current list and funnels every file through
    `_append_file_to_download` so per-downloader defaults are applied.

    :param files: file description dicts
    :type files: list
    """
    self.files_to_download = []
    for file_to_download in files:
        self._append_file_to_download(file_to_download)
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
'''
Find files matching patterns. Sets instance variable files_to_download.
......@@ -130,13 +197,12 @@ class DownloadInterface(object):
if subdir == '^':
subdirs_pattern = subdirs_pattern[1:]
subdir = subdirs_pattern[0]
if not dir_list and pattern == '**/*':
# Take all and no more dirs, take all files
# If getting all, get all files
if pattern == '**/*':
for rfile in file_list:
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
return
for direlt in dir_list:
......@@ -147,10 +213,9 @@ class DownloadInterface(object):
self.match([pattern], subfile_list, subdirs_list, prefix + '/' + subdir, True)
for rfile in file_list:
if pattern == '**/*' or re.match(pattern, rfile['name']):
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
else:
if re.match(subdirs_pattern[0], subdir):
......@@ -163,10 +228,9 @@ class DownloadInterface(object):
else:
for rfile in file_list:
if re.match(pattern, rfile['name']):
rfile['root'] = self.rootdir
if prefix != '':
rfile['name'] = prefix + '/' + rfile['name']
self.files_to_download.append(rfile)
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
if not submatch and len(self.files_to_download) == 0:
raise Exception('no file found matching expressions')
......@@ -226,7 +290,6 @@ class DownloadInterface(object):
self.files_to_copy.append(dfile)
else:
new_files_to_download.append(dfile)
else:
# Copy everything
for dfile in self.files_to_download:
......@@ -236,17 +299,66 @@ class DownloadInterface(object):
else:
new_files_to_download.append(dfile)
self.files_to_download = new_files_to_download
self.set_files_to_download(new_files_to_download)
def download(self, local_dir):
def _download(self, file_path, rfile):
    '''
    Download one file and return False in case of success and True
    otherwise. This must be implemented in subclasses.

    :param file_path: local path where the file must be written
    :type file_path: str
    :param rfile: description of the remote file to fetch
    :type rfile: dict
    '''
    raise NotImplementedError()
def download(self, local_dir, keep_dirs=True):
    '''
    Download remote files to local_dir

    Template method: iterates over files_to_download, computes the local
    path from 'save_as', creates missing directories, delegates the
    actual transfer to the subclass hook `_download` and records the
    download time and permissions.

    :param local_dir: Directory where files should be downloaded
    :type local_dir: str
    :param keep_dirs: keep file name directory structure or copy file in local_dir directly
    :type keep_dirs: bool
    :return: list of downloaded files
    :raises Exception: on kill request or when `_download` reports an error
    '''
    self.logger.debug(self.__class__.__name__ + ':Download')
    nb_files = len(self.files_to_download)
    cur_files = 1
    self.offline_dir = local_dir
    for rfile in self.files_to_download:
        if self.kill_received:
            raise Exception('Kill request received, exiting')
        # Determine where to store file (directory and name)
        file_dir = local_dir
        if keep_dirs:
            file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
        if file_dir[-1] == "/":
            file_path = file_dir + os.path.basename(rfile['save_as'])
        else:
            file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
        # For unit tests only, workflow will take in charge directory
        # creation before to avoid thread multi access
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        msg = self.__class__.__name__ + ':Download:Progress:'
        msg += str(cur_files) + '/' + str(nb_files)
        msg += ' downloading file ' + rfile['name'] + ' save as ' + rfile['save_as']
        self.logger.debug(msg)
        cur_files += 1
        # Wall-clock timing (second resolution) of the transfer
        start_time = datetime.datetime.now()
        start_time = time.mktime(start_time.timetuple())
        error = self._download(file_path, rfile)
        if error:
            rfile['download_time'] = 0
            rfile['error'] = True
            raise Exception(self.__class__.__name__ + ":Download:Error:" + rfile["name"])
        else:
            end_time = datetime.datetime.now()
            end_time = time.mktime(end_time.timetuple())
            rfile['download_time'] = end_time - start_time
        # Set permissions
        self.set_permissions(file_path, rfile)
    return self.files_to_download
def list(self):
'''
......@@ -262,26 +374,6 @@ class DownloadInterface(object):
'''
pass
def set_credentials(self, userpwd):
'''
Set credentials in format user:pwd
:param userpwd: credentials
:type userpwd: str
'''
self.credentials = userpwd
def set_options(self, protocol_options):
"""
Set protocol specific options.
Subclasses that override this method must call the
parent implementation.
"""
self.protocol_options = protocol_options
if "skip_check_uncompress" in protocol_options:
self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
def close(self):
'''
Close connection
......
......@@ -15,7 +15,6 @@ class LocalDownload(DownloadInterface):
remote.dir=/blast/db/FASTA/
remote.files=^alu.*\\.gz$
'''
def __init__(self, rootdir, use_hardlinks=False):
......@@ -24,6 +23,11 @@ class LocalDownload(DownloadInterface):
self.rootdir = rootdir
self.use_hardlinks = use_hardlinks
def _append_file_to_download(self, rfile):
    # Default the file root to this downloader's rootdir so the copy step
    # knows where to read from; respect a root already set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(LocalDownload, self)._append_file_to_download(rfile)
def download(self, local_dir):
'''
Copy local files to local_dir
......
import logging
import os
from datetime import datetime
import time
from biomaj_core.utils import Utils
from biomaj_download.download.interface import DownloadInterface
from irods.session import iRODSSession
from irods.models import Collection, DataObject, User
from irods.models import DataObject, User
class IRODSDownload(DownloadInterface):
# To connect to irods session : sess = iRODSSession(host='localhost', port=1247, user='rods', password='rods', zone='tempZone')
# password : self.credentials
def __init__(self, protocol, server, remote_dir):
# This is used only for messages
real_protocol = "irods"
def __init__(self, server, remote_dir):
DownloadInterface.__init__(self)
self.port = None
self.remote_dir = remote_dir # directory on the remote server : zone
self.port = 1247
self.remote_dir = remote_dir # directory on the remote server including zone
self.rootdir = remote_dir
self.user = None
self.password = None
self.server = server
self.zone = None
self.zone = remote_dir.split("/")[0]
def _append_file_to_download(self, rfile):
    # Default the file root to the iRODS collection rootdir so _download
    # knows where to fetch from; respect a root already set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(IRODSDownload, self)._append_file_to_download(rfile)
def set_param(self, param):
# self.param is a dictionnary which has the following form :{'password': u'biomaj', 'protocol': u'iget', 'user': u'biomaj', 'port': u'port'}
# param is a dictionary which has the following form :
# {'password': u'biomaj', 'user': u'biomaj', 'port': u'port'}
# port is optional
self.param = param
self.port = int(param['port'])
self.user = str(param['user'])
self.password = str(param['password'])
self.zone = str(param['zone'])
if 'port' in param:
self.port = int(param['port'])
def list(self, directory=''):
session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
......@@ -36,10 +39,13 @@ class IRODSDownload(DownloadInterface):
rdirs = []
rfile = {}
date = None
for result in session.query(Collection.name, DataObject.name, DataObject.size, DataObject.owner_name, DataObject.modify_time).filter(User.name == self.user).get_results():
# if the user is biomaj : he will have access to all the irods data (biomaj ressource) : drwxr-xr-x
query = session.query(DataObject.name, DataObject.size,
DataObject.owner_name, DataObject.modify_time)
results = query.filter(User.name == self.user).get_results()
for result in results:
# Avoid duplication
if rfile != {} and rfile['name'] == str(result[DataObject.name]) and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
if rfile != {} and rfile['name'] == str(result[DataObject.name]) \
and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
continue
rfile = {}
date = str(result[DataObject.modify_time]).split(" ")[0].split('-')
......@@ -49,81 +55,28 @@ class IRODSDownload(DownloadInterface):
rfile['day'] = int(date[2])
rfile['year'] = int(date[0])
rfile['name'] = str(result[DataObject.name])
rfile['download_path'] = str(result[Collection.name])
rfiles.append(rfile)
session.cleanup()
return (rfiles, rdirs)
def download(self, local_dir, keep_dirs=True):
'''
Download remote files to local_dir
:param local_dir: Directory where files should be downloaded
:type local_dir: str
:param keep_dirs: keep file name directory structure or copy file in local_dir directly
:param keep_dirs: bool
:return: list of downloaded files
'''
logging.debug('IRODS:Download')
try:
os.chdir(local_dir)
except TypeError:
logging.error("IRODS:list:Could not find offline_dir")
nb_files = len(self.files_to_download)
cur_files = 1
# give a working directory to copy the file from irods
remote_dir = self.remote_dir
for rfile in self.files_to_download:
if self.kill_received:
raise Exception('Kill request received, exiting')
file_dir = local_dir
if 'save_as' not in rfile or rfile['save_as'] is None:
rfile['save_as'] = rfile['name']
if keep_dirs:
file_dir = local_dir + os.path.dirname(rfile['save_as'])
file_path = file_dir + '/' + os.path.basename(rfile['save_as'])
# For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
if not os.path.exists(file_dir):
os.makedirs(file_dir)
logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
cur_files += 1
start_time = datetime.now()
start_time = time.mktime(start_time.timetuple())
self.remote_dir = rfile['root']
error = self.irods_download(file_dir, str(self.remote_dir), str(rfile['name']))
if error:
rfile['download_time'] = 0
rfile['error'] = True
raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
else:
archive_status = Utils.archive_check(file_path)
if not archive_status:
self.logger.error('Archive is invalid or corrupted, deleting file')
rfile['error'] = True
if os.path.exists(file_path):
os.remove(file_path)
raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name'])
end_time = datetime.now()
end_time = time.mktime(end_time.timetuple())
rfile['download_time'] = end_time - start_time
self.set_permissions(file_path, rfile)
self.remote_dir = remote_dir
return(self.files_to_download)
def irods_download(self, file_dir, file_path, file_to_download):
def _download(self, file_dir, rfile):
error = False
logging.debug('IRODS:IRODS DOWNLOAD')
session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
self.logger.debug('IRODS:IRODS DOWNLOAD')
session = iRODSSession(host=self.server, port=self.port,
user=self.user, password=self.password,
zone=self.zone)
try:
file_to_get = str(file_path) + str(file_to_download)
# Write the file to download in the wanted file_dir : with the python-irods iget
# iRODS don't like multiple "/"
if rfile['root'][-1] == "/":
file_to_get = rfile['root'] + rfile['name']
else:
file_to_get = rfile['root'] + "/" + rfile['name']
# Write the file to download in the wanted file_dir with the
# python-irods iget
obj = session.data_objects.get(file_to_get, file_dir)
except ExceptionIRODS as e:
logging.error("RsyncError:" + str(e))
logging.error("RsyncError: irods object" + str(obj))
self.logger.error(self.__class__.__name__ + ":Download:Error:Can't get irods object " + str(obj))
self.logger.error(self.__class__.__name__ + ":Download:Error:" + str(e))
session.cleanup()
return(error)
......
# from future import standard_library
# standard_library.install_aliases()
# from builtins import str
import logging
import re
import os
import subprocess
from datetime import datetime
import time
from biomaj_download.download.interface import DownloadInterface
class RSYNCDownload(DownloadInterface):
'''
Base class to download files from rsyncc
Base class to download files from rsync
protocol = rsync
server =
remote.dir =
......@@ -21,18 +18,76 @@ class RSYNCDownload(DownloadInterface):
remote.files =
'''
def __init__(self, protocol, server, remote_dir):
# This is used to forge the command
real_protocol = "rsync"
def __init__(self, server, rootdir):
DownloadInterface.__init__(self)
logging.debug('Download')
self.rootdir = remote_dir
self.protocol = protocol
if server and remote_dir:
self.logger.debug('Download')
# If rootdir is not given, we are in local mode. In this case, server
# is interpreted as rootdir
self.local_mode = not rootdir
if not self.local_mode:
self.server = server # name of the remote server
self.remote_dir = remote_dir # directory on the remote server
self.rootdir = rootdir # directory on the remote server
else:
if server:
self.server = server
self.remote_dir = ""
self.server = None
self.rootdir = server
# give a working directory to run rsync
if self.local_mode:
try:
os.chdir(self.rootdir)
except TypeError:
self.logger.error("RSYNC:Could not find local dir " + self.rootdir)
def _append_file_to_download(self, rfile):
    # Default the file root to the rsync rootdir (remote directory, or
    # local directory in local mode); respect a root set by the caller.
    if 'root' not in rfile or not rfile['root']:
        rfile['root'] = self.rootdir
    super(RSYNCDownload, self)._append_file_to_download(rfile)
def _remote_file_name(self, rfile):
# rfile['root'] is set to self.rootdir. We don't use os.path.join
# because rfile['name'] may starts with /
url = rfile['root'] + "/" + rfile['name']
if not self.local_mode:
url = self.server + ":" + url
return url
def _download(self, file_path, rfile):
    """Fetch one remote file into *file_path* by shelling out to rsync.

    Returns True on failure, False on success (the convention used by the
    calling download loop).
    """
    error = False
    # Stays '' (hence != 0, hence error) if Popen raises before returncode
    # is read -- preserves the original failure semantics.
    err_code = ''
    url = self._remote_file_name(rfile)
    # Assemble the rsync command line, injecting credentials when present.
    source = url
    if self.credentials:
        source = str(self.credentials) + "@" + url
    cmd = str(self.real_protocol) + " " + source + " " + str(file_path)
    self.logger.debug('RSYNC:RSYNC DOwNLOAD:' + cmd)
    # Launch the command (we are in offline_dir)
    try:
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        _, stderr = proc.communicate()
        err_code = proc.returncode
        # Turn rsync stderr diagnostics into ExceptionRsync.
        self.test_stderr_rsync_message(stderr)
        self.test_stderr_rsync_error(stderr)
    except ExceptionRsync as e:
        self.logger.error(str(self.real_protocol) + " error:" + str(e))
    if err_code != 0:
        self.logger.error('Error while downloading ' + rfile["name"] + ' - ' + str(err_code))
        error = True
    return error
def test_stderr_rsync_error(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) reports an 'rsync error'."""
    text = stderr.decode('utf-8')
    if "rsync error" in text:
        # Extract the first line after the "rsync error:" marker.
        reason = text.split(str(self.real_protocol) + " error:")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def test_stderr_rsync_message(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) carries an 'rsync:' message."""
    text = stderr.decode('utf-8')
    if "rsync:" in text:
        # Extract the first line after the "rsync:" marker.
        reason = text.split(str(self.real_protocol) + ":")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def list(self, directory=''):
'''
......@@ -43,18 +98,14 @@ class RSYNCDownload(DownloadInterface):
err_code = None
rfiles = []
rdirs = []
logging.debug('RSYNC:List')
# give a working directory to run rsync
try:
os.chdir(self.offline_dir)
except TypeError:
logging.error("RSYNC:list:Could not find offline_dir")
if self.remote_dir and self.credentials:
cmd = str(self.protocol) + " --list-only " + str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(directory)
elif (self.remote_dir and not self.credentials):
cmd = str(self.protocol) + " --list-only " + str(self.server) + ":" + str(self.remote_dir) + str(directory)
else: # Local rsync for unitest
cmd = str(self.protocol) + " --list-only " + str(self.server) + str(directory)
self.logger.debug('RSYNC:List')
if self.local_mode:
remote = str(self.rootdir) + str(directory)
else:
remote = str(self.server) + ":" + str(self.rootdir) + str(directory)
if self.credentials:
remote = str(self.credentials) + "@" + remote
cmd = str(self.real_protocol) + " --list-only " + remote
try:
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
list_rsync, err = p.communicate()
......@@ -62,9 +113,9 @@ class RSYNCDownload(DownloadInterface):
self.test_stderr_rsync_error(err)
err_code = p.returncode
except ExceptionRsync as e:
logging.error("RsyncError:" + str(e))
self.logger.error("RsyncError:" + str(e))
if err_code != 0:
logging.error('Error while listing ' + str(err_code))
self.logger.error('Error while listing ' + str(err_code))
return(rfiles, rdirs)
list_rsync = str(list_rsync.decode('utf-8'))
lines = list_rsync.rstrip().split("\n")
......@@ -92,97 +143,6 @@ class RSYNCDownload(DownloadInterface):
return (rfiles, rdirs)
def download(self, local_dir, keep_dirs=True):
    '''
    Download remote files to local_dir

    :param local_dir: Directory where files should be downloaded
    :type local_dir: str
    :param keep_dirs: keep file name directory structure or copy file in local_dir directly
    :param keep_dirs: bool
    :return: list of downloaded files
    :raises Exception: on kill request or when a file fails to download
    '''
    logging.debug('RSYNC:Download')
    nb_files = len(self.files_to_download)
    cur_files = 1
    # give a working directory to run rsync
    try:
        os.chdir(self.offline_dir)
    except TypeError:
        logging.error("RSYNC:list:Could not find offline_dir")
    for rfile in self.files_to_download:
        if self.kill_received:
            raise Exception('Kill request received, exiting')
        file_dir = local_dir
        if 'save_as' not in rfile or rfile['save_as'] is None:
            rfile['save_as'] = rfile['name']
        if keep_dirs:
            file_dir = local_dir + '/' + os.path.dirname(rfile['save_as'])
        # BUGFIX: the previous regex-based concatenation had its branches
        # inverted -- it omitted the '/' separator when file_dir had no
        # trailing slash (and doubled it when it had one). os.path.join
        # inserts the separator exactly when needed.
        file_path = os.path.join(file_dir, os.path.basename(rfile['save_as']))
        # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name'])
        logging.debug('RSYNC:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as'])
        cur_files += 1
        # Timestamps are truncated to whole seconds via mktime, as before.
        start_time = datetime.now()
        start_time = time.mktime(start_time.timetuple())
        error = self.rsync_download(file_path, rfile['name'])
        if error:
            rfile['download_time'] = 0
            rfile['error'] = True
            raise Exception("RSYNC:Download:Error:" + rfile['root'] + '/' + rfile['name'])
        end_time = datetime.now()
        end_time = time.mktime(end_time.timetuple())
        rfile['download_time'] = end_time - start_time
        self.set_permissions(file_path, rfile)
    return self.files_to_download
def rsync_download(self, file_path, file_to_download):
    """Shell out to rsync to copy *file_to_download* to *file_path*.

    Returns True on failure, False on success.
    """
    error = False
    # Stays '' (hence != 0, hence error) if an exception fires before the
    # return code is read.
    err_code = ''
    logging.debug('RSYNC:RSYNC DOwNLOAD')
    # give a working directory to run rsync
    try:
        os.chdir(self.offline_dir)
    except TypeError:
        logging.error("RSYNC:list:Could not find offline_dir")
    try:
        # Build the source spec: remote (with or without credentials) or
        # plain local path (used by the unit tests).
        if self.remote_dir and self.credentials:
            remote = str(self.credentials) + "@" + str(self.server) + ":" + str(self.remote_dir) + str(file_to_download)
        elif self.remote_dir:
            remote = str(self.server) + ":" + str(self.remote_dir) + str(file_to_download)
        else:
            remote = str(self.server) + str(file_to_download)
        cmd = str(self.protocol) + " " + remote + " " + str(file_path)
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        _, stderr = proc.communicate()
        err_code = proc.returncode
        self.test_stderr_rsync_message(stderr)
        self.test_stderr_rsync_error(stderr)
    except ExceptionRsync as e:
        logging.error("RsyncError:" + str(e))
    if err_code != 0:
        logging.error('Error while downloading ' + file_to_download + ' - ' + str(err_code))
        error = True
    return error
def test_stderr_rsync_error(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) reports an 'rsync error'."""
    text = stderr.decode('utf-8')
    if "rsync error" in text:
        # Extract the first line after the "rsync error:" marker.
        reason = text.split(str(self.protocol) + " error:")[1].split("\n")[0]
        raise ExceptionRsync(reason)
def test_stderr_rsync_message(self, stderr):
    """Raise ExceptionRsync if *stderr* (bytes) carries an 'rsync:' message."""
    text = stderr.decode('utf-8')
    if "rsync:" in text:
        # Extract the first line after the "rsync:" marker.
        reason = text.split(str(self.protocol) + ":")[1].split("\n")[0]
        raise ExceptionRsync(reason)
class ExceptionRsync(Exception):
def __init__(self, exception_reason):
......
......@@ -13,10 +13,9 @@ import pika
from flask import Flask
from flask import jsonify
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.http import HTTPDownload
from biomaj_download.download.curl import CurlDownload
from biomaj_download.download.direct import DirectFTPDownload
from biomaj_download.download.direct import DirectHttpDownload
from biomaj_download.download.direct import DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
from biomaj_download.message import downmessage_pb2
from biomaj_download.download.rsync import RSYNCDownload
......@@ -134,24 +133,24 @@ class DownloadService(object):
protocol_options={}):
protocol = downmessage_pb2.DownloadFile.Protocol.Value(protocol_name.upper())
downloader = None
if protocol in [0, 1]:
downloader = FTPDownload(protocol_name, server, remote_dir)
if protocol in [2, 3]:
downloader = HTTPDownload(protocol_name, server, remote_dir, http_parse)
if protocol == 7:
downloader = LocalDownload(remote_dir)
if protocol == 4:
downloader = DirectFTPDownload('ftp', server, '/')
if protocol == 10:
if protocol in [0, 1]: # FTP, SFTP
downloader = CurlDownload(protocol_name, server, remote_dir)
if protocol in [2, 3]: # HTTP, HTTPS (could be factored with previous case)
downloader = CurlDownload(protocol_name, server, remote_dir, http_parse)
if protocol == 4: # DirectFTP
downloader = DirectFTPDownload("ftp", server, '/')
if protocol == 5: # DirectHTTP
downloader = DirectHTTPDownload("http", server, '/')
if protocol == 6: # DirectHTTPS
downloader = DirectHTTPDownload("https", server, '/')
if protocol == 10: # DirectFTPS
downloader = DirectFTPDownload('ftps', server, '/')
if protocol == 5:
downloader = DirectHttpDownload('http', server, '/')
if protocol == 6:
downloader = DirectHttpDownload('https', server, '/')
if protocol == 8:
downloader = RSYNCDownload('rsync', server, remote_dir)
if protocol == 9:
downloader = IRODSDownload('irods', server, remote_dir)
if protocol == 7: # Local
downloader = LocalDownload(remote_dir)
if protocol == 8: # RSYNC
downloader = RSYNCDownload(server, remote_dir)
if protocol == 9: # iRods
downloader = IRODSDownload(server, remote_dir)
if downloader is None:
return None
......@@ -182,11 +181,13 @@ class DownloadService(object):
if save_as:
downloader.set_save_as(save_as)
if param:
downloader.set_param(param)
downloader.set_server(server)
# Set the name of the BioMAJ protocol to which we respond.
downloader.set_protocol(protocol_name)
if protocol_options is not None:
......
biomaj3-download (3.0.27-1) UNRELEASED; urgency=medium
[ PENDING ]
Needs python3-ftputil, in NEW queue
biomaj3-download (3.1.0-1) unstable; urgency=medium
[ Olivier Sallou ]
* New upstream release
-- Olivier Sallou <osallou@debian.org> Wed, 16 Oct 2019 13:17:33 +0000
-- Olivier Sallou <osallou@debian.org> Tue, 12 Nov 2019 10:18:15 +0000
biomaj3-download (3.0.21-1) unstable; urgency=medium
......
......@@ -12,6 +12,7 @@ Build-Depends: debhelper (>= 12~),
python3-consul,
python3-flask,
python3-humanfriendly,
python3-irodsclient,
python3-mock,
python3-nose,
python3-pika,
......@@ -25,7 +26,8 @@ Build-Depends: debhelper (>= 12~),
python3-yaml,
python3-biomaj3-core (>= 3.0.19),
python3-biomaj3-zipkin,
python3-ftputil
python3-ftputil,
rsync
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/biomaj3-download
Vcs-Git: https://salsa.debian.org/med-team/biomaj3-download.git
......
......@@ -22,7 +22,7 @@ config = {
'url': 'http://biomaj.genouest.org',
'download_url': 'http://biomaj.genouest.org',
'author_email': 'olivier.sallou@irisa.fr',
'version': '3.0.27',
'version': '3.1.0',
'classifiers': [
# How mature is this project? Common values are
# 3 - Alpha
......
from nose.tools import *
"""
Note that attributes 'network' and 'local_irods' are ignored for CI.
"""
from nose.plugins.attrib import attr
import json
import shutil
import os
import sys
import tempfile
import logging
import copy
import stat
import time
from mock import patch
from optparse import OptionParser
from biomaj_core.config import BiomajConfig
from biomaj_core.utils import Utils
from biomaj_download.download.ftp import FTPDownload
from biomaj_download.download.direct import DirectFTPDownload, DirectHttpDownload
from biomaj_download.download.http import HTTPDownload, HTTPParse
from biomaj_download.download.curl import CurlDownload, HTTPParse
from biomaj_download.download.direct import DirectFTPDownload, DirectHTTPDownload
from biomaj_download.download.localcopy import LocalDownload
from biomaj_download.download.downloadthreads import DownloadThread
from biomaj_download.download.rsync import RSYNCDownload
from biomaj_download.download.protocolirods import IRODSDownload
import pprint
import unittest
class UtilsForTest():
......@@ -263,7 +255,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.utils.clean()
def test_http_list(self):
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
......@@ -271,7 +263,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_list_dateregexp(self):
#self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.close()
self.assertTrue(len(file_list) == 1)
......@@ -287,7 +279,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
-1
)
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -304,7 +296,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.config.get('http.group.file.date_format', None),
int(self.config.get('http.group.file.size'))
)
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -313,7 +305,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse)
(file_list, dir_list) = httpd.list()
print(str(file_list))
httpd.match([r'^README$'], file_list, dir_list)
......@@ -323,7 +315,7 @@ class TestBiomajHTTPDownload(unittest.TestCase):
def test_http_download_in_subdir(self):
self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M"
httpd = HTTPDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^dists/README$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
......@@ -331,6 +323,65 @@ class TestBiomajHTTPDownload(unittest.TestCase):
self.assertTrue(len(httpd.files_to_download) == 1)
@attr('network')
@attr('https')
class TestBiomajHTTPSDownload(unittest.TestCase):
"""
Test HTTPS downloader
"""
def setUp(self):
self.utils = UtilsForTest()
def tearDown(self):
self.utils.clean()
def test_download(self):
# NOTE(review): self.utils is re-created here although setUp already did
# so -- harmless but redundant; confirm intent.
self.utils = UtilsForTest()
# NOTE(review): the two regex arguments below are near-duplicates that
# look like an interleaved old/new pair from a diff -- confirm which one
# HTTPParse should actually receive (capture-group placement differs).
self.http_parse = HTTPParse(
"<a[\s]+href=\"([\w\-\.]+\">[\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
"<a[\s]+href=\"[\w\-\.]+\">([\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])",
# NOTE(review): the repeated '1, 2' pairs below may likewise be a stale
# old/new interleave of HTTPParse's positional arguments -- verify
# against the HTTPParse signature.
1,
2,
1,
2,
None,
3
)
self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M"
httpd = CurlDownload('https', 'mirrors.edge.kernel.org', '/pub/software/scm/git/debian/', self.http_parse)
(file_list, dir_list) = httpd.list()
httpd.match([r'^git-core-0.99.6.tar.gz$'], file_list, dir_list)
httpd.download(self.utils.data_dir)
httpd.close()
self.assertTrue(len(httpd.files_to_download) == 1)
@attr('network')
@attr('sftp')
class TestBiomajSFTPDownload(unittest.TestCase):
    """Network test for the SFTP flavour of CurlDownload.

    NOTE(review): PROTOCOL is "ftps" although the class is named SFTP --
    confirm against CurlDownload's supported protocol names.
    """

    PROTOCOL = "ftps"

    def setUp(self):
        self.utils = UtilsForTest()

    def tearDown(self):
        self.utils.clean()

    def test_download(self):
        # Download a single known file from the public rebex test server.
        downloader = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
        downloader.set_credentials("demo:password")
        file_list, dir_list = downloader.list()
        downloader.match([r'^readme.txt$'], file_list, dir_list)
        downloader.download(self.utils.data_dir)
        downloader.close()
        self.assertTrue(len(downloader.files_to_download) == 1)
@attr('directftp')
@attr('network')
class TestBiomajDirectFTPDownload(unittest.TestCase):
......@@ -411,7 +462,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_http_list(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
fday = ftpd.files_to_download[0]['day']
fmonth = ftpd.files_to_download[0]['month']
......@@ -424,7 +475,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
(file_list, dir_list) = ftpd.list()
ftpd.download(self.utils.data_dir, False)
......@@ -433,7 +484,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_get_params_save_as(self):
file_list = ['/get']
ftpd = DirectHttpDownload('http', 'httpbin.org', '')
ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
......@@ -449,7 +500,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
@attr('test')
def test_download_save_as(self):
file_list = ['/debian/README.html']
ftpd = DirectHttpDownload('http', 'ftp2.fr.debian.org', '')
ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '')
ftpd.set_files_to_download(file_list)
ftpd.save_as = 'test.html'
(file_list, dir_list) = ftpd.list()
......@@ -460,7 +511,7 @@ class TestBiomajDirectHTTPDownload(unittest.TestCase):
def test_download_post_params(self):
#file_list = ['/debian/README.html']
file_list = ['/post']
ftpd = DirectHttpDownload('http', 'httpbin.org', '')
ftpd = DirectHTTPDownload('http', 'httpbin.org', '')
ftpd.set_files_to_download(file_list)
ftpd.param = { 'key1': 'value1', 'key2': 'value2'}
ftpd.save_as = 'test.json'
......@@ -489,19 +540,19 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.utils.clean()
def test_ftp_list(self):
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) > 1)
@attr('test')
def test_download(self):
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
# This tests fails because the zip file is fake. We intercept the failure
# and continue.
# See test_download_skip_uncompress_checks
# See test_download_skip_check_uncompress
try:
ftpd.download(self.utils.data_dir)
except Exception:
......@@ -511,9 +562,9 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
ftpd.close()
def test_download_skip_checks_uncompress(self):
def test_download_skip_check_uncompress(self):
# This test is similar to test_download but we skip test of zip file.
ftpd = FTPDownload('ftp', 'speedtest.tele2.net', '/')
ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/')
ftpd.set_options(dict(skip_check_uncompress=True))
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^1.*KB\.zip$'], file_list, dir_list)
......@@ -522,7 +573,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 2)
def test_download_in_subdir(self):
ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
(file_list, dir_list) = ftpd.list()
try:
ftpd.match([r'^doc/mailing-lists.txt$'], file_list, dir_list)
......@@ -534,7 +585,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(len(ftpd.files_to_download) == 1)
def test_download_or_copy(self):
ftpd = FTPDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd = CurlDownload('ftp', 'ftp.fr.debian.org', '/debian/')
ftpd.files_to_download = [
{'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
{'name':'/test2', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
......@@ -566,7 +617,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
self.assertTrue(release['day']=='12')
def test_ms_server(self):
ftpd = FTPDownload("ftp", "test.rebex.net", "/")
ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match(["^readme.txt$"], file_list, dir_list)
......@@ -579,7 +630,7 @@ class TestBiomajFTPDownload(unittest.TestCase):
Test setting tcp_keepalive (it probably doesn't change anything here but
we test that there is no obvious mistake in the code).
"""
ftpd = FTPDownload("ftp", "test.rebex.net", "/")
ftpd = CurlDownload("ftp", "test.rebex.net", "/")
ftpd.set_options(dict(tcp_keepalive=10))
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
......@@ -604,14 +655,14 @@ class TestBiomajFTPSDownload(unittest.TestCase):
self.utils.clean()
def test_ftps_list(self):
ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.close()
self.assertTrue(len(file_list) == 1)
def test_download(self):
ftpd = FTPDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/")
ftpd.set_credentials("demo:password")
(file_list, dir_list) = ftpd.list()
ftpd.match([r'^readme.txt$'], file_list, dir_list)
......@@ -624,7 +675,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
......@@ -636,7 +687,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False"))
ftpd.set_credentials(CREDENTIALS)
(file_list, dir_list) = ftpd.list()
......@@ -651,7 +702,7 @@ class TestBiomajFTPSDownload(unittest.TestCase):
SERVER = "demo.wftpserver.com"
DIRECTORY = "/download/"
CREDENTIALS = "demo-user:demo-user"
ftpd = FTPDownload(self.PROTOCOL, SERVER, DIRECTORY)
ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY)
curdir = os.path.dirname(os.path.realpath(__file__))
cert_file = os.path.join(curdir, "caert.demo.wftpserver.com.pem")
ftpd.set_options(dict(ssl_verifyhost="False", ssl_server_cert=cert_file))
......@@ -672,7 +723,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
def setUp(self):
self.utils = UtilsForTest()
self.curdir = os.path.dirname(os.path.realpath(__file__))
self.curdir = os.path.dirname(os.path.realpath(__file__)) + '/'
self.examples = os.path.join(self.curdir,'bank') + '/'
BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False)
......@@ -680,40 +731,34 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.utils.clean()
def test_rsync_list(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
self.assertTrue(len(files_list) != 0)
def test_rsync_match(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], files_list, dir_list, prefix='', submatch=False)
self.assertTrue(len(rsyncd.files_to_download) != 0)
def test_rsync_download(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
error = rsyncd.rsync_download(self.utils.data_dir, "test2.fasta")
self.assertTrue(error == 0)
rsyncd = RSYNCDownload(self.examples, "")
rfile = {
"name": "test2.fasta",
"root": self.examples
}
error = rsyncd._download(self.utils.data_dir, rfile)
self.assertFalse(error)
def test_rsync_general_download(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_credentials(None)
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(files_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'],files_list,dir_list, prefix='')
download_files=rsyncd.download(self.curdir)
self.assertTrue(len(download_files)==1)
def test_rsync_download_or_copy(self):
rsyncd = RSYNCDownload('rsync', self.examples, "")
rsyncd.set_offline_dir(self.utils.data_dir)
rsyncd = RSYNCDownload(self.examples, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
files_to_download_prev = rsyncd.files_to_download
......@@ -721,8 +766,7 @@ class TestBiomajRSYNCDownload(unittest.TestCase):
self.assertTrue(files_to_download_prev != rsyncd.files_to_download)
def test_rsync_download_in_subdir(self):
rsyncd = RSYNCDownload('rsync', self.curdir+'/', "")
rsyncd.set_offline_dir(self.curdir+'/')
rsyncd = RSYNCDownload(self.curdir, "")
(file_list, dir_list) = rsyncd.list()
rsyncd.match([r'^/bank/test*'], file_list, dir_list, prefix='')
rsyncd.download(self.utils.data_dir)
......@@ -824,8 +868,22 @@ class TestBiomajIRODSDownload(unittest.TestCase):
initialize_mock.return_value=mock_session.configure()
query_mock.return_value = mock_session.query(None,None,None,None,None)
cleanup_mock.return_value = mock_session.cleanup()
irodsd = IRODSDownload('irods', self.examples, "")
irodsd.set_credentials(None)
irodsd.set_offline_dir(self.utils.data_dir)
irodsd = IRODSDownload(self.examples, "")
(files_list, dir_list) = irodsd.list()
self.assertTrue(len(files_list) != 0)
@attr('local_irods')
def test_irods_download(self):
    # Requires a local iRODS server (default port, user 'rods', password
    # 'rods') whose zone /tempZone/home/rods holds a file matching
    # r'^test.*\.gz$' (e.g. a copy of tests/bank/test/test.fasta.gz).
    downloader = IRODSDownload("localhost", "/tempZone/home/rods")
    downloader.set_param({
        'user': 'rods',
        'password': 'rods',
    })
    file_list, dir_list = downloader.list()
    downloader.match([r'^test.*\.gz$'], file_list, dir_list, prefix='')
    downloader.download(self.utils.data_dir)
    self.assertTrue(len(downloader.files_to_download) == 1)