Skip to content
Commits on Source (4)
language: python
sudo: false
python:
- '2.7'
- '3.4'
- '3.5'
- '2.7'
- '3.4'
- '3.5'
- '3.6'
branches:
except:
- "/^feature.*$/"
......
3.0.19 [unreleased]:
Add ftps and directftps protocols
#4 Harden hardlinks
copy_files_with_regexp: remove duplicates
3.0.18:
Allow to use hardlinks in copy_files and copy_files_with_regexp
3.0.17:
Fix --log option
3.0.16:
Add some warnings if some file is missing
3.0.15:
......
......@@ -8,3 +8,5 @@ Properties can be overridden by environment variables with pattern BIOMAJ_X_Y_Z f
Example:
use BIOMAJ_LDAP_HOST for ldap.host
[![PyPI version](https://badge.fury.io/py/biomaj-core.svg)](https://badge.fury.io/py/biomaj-core)
......@@ -8,6 +8,7 @@ import os
import time
import sys
from biomaj_core.utils import Utils
from biomaj_core.bmajindex import BmajIndex
if sys.version < '3':
......@@ -40,8 +41,10 @@ class BiomajConfig(object):
'db.type': '',
'db.formats': '',
'keep.old.version': 1,
'keep.old.sessions': 0,
'docker.sudo': '1',
'auto_publish': 0
'auto_publish': 0,
'use_hardlinks': 0
}
# Old biomaj level compatibility
......@@ -191,9 +194,11 @@ class BiomajConfig(object):
if options is not None and options.get_option('log') is not None:
hdlr.setLevel(BiomajConfig.LOGLEVEL[options.get_option('log')])
self.log_level = BiomajConfig.LOGLEVEL[options.get_option('log')]
logger.setLevel(self.log_level)
else:
hdlr.setLevel(BiomajConfig.LOGLEVEL[self.get('historic.logfile.level')])
self.log_level = BiomajConfig.LOGLEVEL[self.get('historic.logfile.level')]
logger.setLevel(self.log_level)
formatter = logging.Formatter('%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
......@@ -281,19 +286,9 @@ class BiomajConfig(object):
"""
Get a boolean property from bank or general configuration. Optionally in section.
"""
value = None
value = self.get(prop, section, escape, default)
if self._in_env(prop):
value = self._in_env(prop)
else:
value = self.get(prop, section, escape, default)
if value is None:
return False
if value is True or value == 'true' or value == '1':
return True
else:
return False
return Utils.to_bool(value)
def get(self, prop, section='GENERAL', escape=True, default=None):
"""
......@@ -402,8 +397,8 @@ class BiomajConfig(object):
status = False
else:
protocol = self.get('protocol')
allowed_protocols = ['none', 'multi', 'local', 'ftp', 'sftp', 'http',
'https', 'directftp', 'directhttp', 'directhttps', 'rsync', 'irods']
allowed_protocols = ['none', 'multi', 'local', 'ftp', 'ftps', 'http', 'https',
'directftp', 'directftps', 'directhttp', 'directhttps', 'rsync', 'irods']
if protocol not in allowed_protocols:
logging.error('Protocol not supported: ' + protocol)
status = False
......@@ -417,7 +412,7 @@ class BiomajConfig(object):
elif not self.get('remote.dir').endswith('/'):
logging.error('remote.dir must end with a /')
return False
if protocol not in ['direcftp', 'directhttp', 'directhttps'] and\
if protocol not in ['directftp', 'directftps', 'directhttp', 'directhttps'] and\
not self.get('remote.files') and\
not self.get('remote.list'):
logging.error('remote.files not set')
......
import os
import errno
import re
import logging
import shutil
......@@ -152,17 +153,18 @@ class Utils(object):
Each file is a dict like with (at least) parameters: year, month, day
"""
release = None
if not files:
return None
# release = None
rfile = files[0]
release = {'year': rfile['year'], 'month': rfile['month'], 'day': rfile['day']}
for rfile in files:
if release is None:
release = {'year': rfile['year'], 'month': rfile['month'], 'day': rfile['day']}
else:
rel_date = datetime.date(int(release['year']), int(release['month']), int(release['day']))
file_date = datetime.date(int(rfile['year']), int(rfile['month']), int(rfile['day']))
if file_date > rel_date:
release['year'] = rfile['year']
release['month'] = rfile['month']
release['day'] = rfile['day']
rel_date = datetime.date(int(release['year']), int(release['month']), int(release['day']))
file_date = datetime.date(int(rfile['year']), int(rfile['month']), int(rfile['day']))
if file_date > rel_date:
release['year'] = rfile['year']
release['month'] = rfile['month']
release['day'] = rfile['day']
return release
@staticmethod
......@@ -195,7 +197,8 @@ class Utils(object):
}[date]
@staticmethod
def copy_files(files_to_copy, to_dir, move=False, lock=None):
def copy_files(files_to_copy, to_dir, move=False, lock=None,
use_hardlinks=False):
"""
Copy or move files to to_dir, keeping directory structure.
......@@ -214,6 +217,8 @@ class Utils(object):
:type move: bool
:param lock: thread lock object for multi-threads
:type lock: Lock
:param use_hardlinks: use hard links (if possible)
:type use_hardlinks: bool
"""
logger = logging.getLogger('biomaj')
nb_files = len(files_to_copy)
......@@ -244,14 +249,48 @@ class Utils(object):
else:
start_time = datetime.datetime.now()
start_time = time.mktime(start_time.timetuple())
shutil.copyfile(from_file, to_file)
if use_hardlinks:
try:
os.link(from_file, to_file)
logger.debug("Using hardlinks to copy %s",
file_to_copy['name'])
except OSError as e:
if e.errno in (errno.ENOSYS, errno.ENOTSUP):
msg = "Your system doesn't support hard links. Using regular copy."
logger.warn(msg)
# Copy this file (the stats are copied at the end
# of the function)
shutil.copyfile(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
elif e.errno == errno.EPERM:
msg = "The FS at %s doesn't support hard links. Using regular copy."
logger.warn(msg, to_dir)
# Copy this file (the stats are copied at the end
# of the function)
shutil.copyfile(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
elif e.errno == errno.EXDEV:
msg = "Cross device hard link is impossible (source: %s, dest: %s). Using regular copy."
logger.warn(msg, from_file, to_dir)
# Copy this file
shutil.copyfile(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
else:
raise
else:
shutil.copyfile(from_file, to_file)
end_time = datetime.datetime.now()
end_time = time.mktime(end_time.timetuple())
file_to_copy['download_time'] = end_time - start_time
shutil.copystat(from_file, to_file)
if not use_hardlinks:
shutil.copystat(from_file, to_file)
@staticmethod
def copy_files_with_regexp(from_dir, to_dir, regexps, move=False, lock=None):
def copy_files_with_regexp(from_dir, to_dir, regexps, move=False, lock=None,
use_hardlinks=False):
"""
Copy or move files from from_dir to to_dir matching regexps.
Copy keeps the original file stats.
......@@ -266,20 +305,28 @@ class Utils(object):
:type move: bool
:param lock: thread lock object for multi-threads
:type lock: Lock
:param use_hardlinks: use hard links (if possible)
:type use_hardlinks: bool
:return: list of copied files with their size
"""
logger = logging.getLogger('biomaj')
files_to_copy = []
for root, dirs, files in os.walk(from_dir, topdown=True):
files_list = []
for root, _, files in os.walk(from_dir, topdown=True):
for name in files:
for reg in regexps:
file_relative_path = os.path.join(root, name).replace(from_dir, '')
if file_relative_path.startswith('/'):
file_relative_path = file_relative_path.replace('/', '', 1)
# sometimes files appear twice.... check not already managed
if file_relative_path in files_list:
continue
if reg == "**/*":
files_to_copy.append({'name': file_relative_path})
files_list.append(file_relative_path)
continue
if re.match(reg, file_relative_path):
files_list.append(file_relative_path)
files_to_copy.append({'name': file_relative_path})
continue
......@@ -305,8 +352,43 @@ class Utils(object):
if move:
shutil.move(from_file, to_file)
else:
shutil.copyfile(from_file, to_file)
shutil.copystat(from_file, to_file)
if use_hardlinks:
try:
os.link(from_file, to_file)
logger.debug("Using hardlinks to copy %s",
file_to_copy['name'])
except OSError as e:
if e.errno in (errno.ENOSYS, errno.ENOTSUP):
msg = "Your system doesn't support hard links. Using regular copy."
logger.warn(msg)
# Copy this file (the stats are copied at the end
# of the function)
shutil.copyfile(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
elif e.errno == errno.EPERM:
msg = "The FS at %s doesn't support hard links. Using regular copy."
logger.warn(msg, to_dir)
# Copy this file (we copy the stats here because
# it's not done at the end of the function)
shutil.copyfile(from_file, to_file)
shutil.copystat(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
elif e.errno == errno.EXDEV:
msg = "Cross device hard link is impossible (source: %s, dest: %s). Using regular copy."
logger.warn(msg, from_file, to_dir)
# Copy this file (we copy the stats here because
# it's not done at the end of the function)
shutil.copyfile(from_file, to_file)
shutil.copystat(from_file, to_file)
# Don't try links anymore
use_hardlinks = False
else:
raise
else:
shutil.copyfile(from_file, to_file)
shutil.copystat(from_file, to_file)
file_to_copy['size'] = os.path.getsize(to_file)
f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(to_file))
file_to_copy['year'] = str(f_stat.year)
......@@ -384,3 +466,28 @@ class Utils(object):
os.remove(archivefile)
return True
@staticmethod
def to_bool(value):
    """
    Convert a configuration value to a boolean.

    Booleans are returned unchanged. The number 1 and the strings 'true'
    (any case) and '1' give True; whitespace around a string is ignored.
    Anything else (None, empty values, other numbers or strings,
    values without string methods) gives False.

    :param value: value to convert
    :type value: bool, int, str or None
    :return: boolean interpretation of value
    :rtype: bool
    """
    if isinstance(value, bool):
        return value
    if not value:
        return False
    # Numeric flags have no lower(): they used to end up in the except
    # branch, so an integer 1 was reported as False.
    if isinstance(value, (int, float)):
        return value == 1
    try:
        text = value.strip().lower()
    except (AttributeError, TypeError):
        # Not a string-like value
        return False
    return text in ('true', '1')
@staticmethod
def to_int(value):
    """
    Convert a value to an integer, falling back to 0.

    Integers are returned as is. Empty values (None, '', ...) and values
    that int() cannot convert give 0.

    :param value: value to convert
    :return: integer value, or 0 when no conversion is possible
    :rtype: int
    """
    if isinstance(value, int):
        return value
    converted = 0
    if value:
        try:
            converted = int(value)
        except Exception:
            # Best effort: whatever int() rejects counts as 0
            converted = 0
    return converted
biomaj3-core (3.0.19-1) UNRELEASED; urgency=medium
* New upstream release
-- Olivier Sallou <osallou@debian.org> Wed, 16 Oct 2019 13:14:38 +0000
biomaj3-core (3.0.16-1) unstable; urgency=medium
* New upstream release
......
......@@ -21,7 +21,7 @@ config = {
'url': 'http://biomaj.genouest.org',
'download_url': 'http://biomaj.genouest.org',
'author_email': 'olivier.sallou@irisa.fr',
'version': '3.0.16',
'version': '3.0.19',
'classifiers': [
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
......
from __future__ import print_function
from nose.tools import *
from nose.plugins.attrib import attr
import shutil
import os
import sys
import tempfile
import stat
......@@ -24,6 +27,7 @@ class UtilsForTest:
Setup the temp dirs and files.
"""
self.global_properties = None
self.global_properties_hl = None
self.bank_properties = None
self.test_dir = tempfile.mkdtemp('biomaj')
......@@ -50,6 +54,9 @@ class UtilsForTest:
if self.global_properties is None:
self.__copy_global_properties()
if self.global_properties_hl is None:
self.__copy_global_properties_hl()
if self.bank_properties is None:
self.__copy_test_bank_properties()
......@@ -86,7 +93,8 @@ class UtilsForTest:
'error.properties', 'local.properties',
'localprocess.properties', 'testhttp.properties',
'computed.properties', 'computed2.properties',
'sub1.properties', 'sub2.properties']
'sub1.properties', 'sub2.properties',
'hardlinks.properties']
for prop in properties:
from_file = os.path.join(curdir, prop)
to_file = os.path.join(self.conf_dir, prop)
......@@ -126,6 +134,29 @@ class UtilsForTest:
fout.write(line)
fout.close()
def __copy_global_properties_hl(self):
    """
    Write the hardlinks variant of the global properties file.

    Copies the 'global_hardlinks.properties' template located next to this
    module into self.conf_dir, replacing the conf.dir, log.dir, data.dir,
    process.dir and lock.dir lines with the matching directories of this
    object. Does nothing when self.global_properties_hl is already set.
    """
    if self.global_properties_hl is not None:
        return
    self.global_properties_hl = os.path.join(self.conf_dir, 'global_hardlinks.properties')
    curdir = os.path.dirname(os.path.realpath(__file__))
    global_template = os.path.join(curdir, 'global_hardlinks.properties')
    # Property name -> attribute holding the directory to write instead.
    # Attributes are looked up lazily, only when the property is found.
    overrides = (
        ('conf.dir', 'conf_dir'),
        ('log.dir', 'log_dir'),
        ('data.dir', 'data_dir'),
        ('process.dir', 'process_dir'),
        ('lock.dir', 'lock_dir'),
    )
    # Both files are handled by the with statement so the output file is
    # closed even when reading the template fails.
    with open(self.global_properties_hl, 'w') as fout, open(global_template, 'r') as fin:
        for line in fin:
            for prefix, attr in overrides:
                if line.startswith(prefix):
                    fout.write(prefix + "=" + getattr(self, attr) + "\n")
                    break
            else:
                # Not a directory property: copy the line unchanged
                fout.write(line)
class TestBiomajUtils(unittest.TestCase):
......@@ -163,6 +194,24 @@ class TestBiomajUtils(unittest.TestCase):
endpoint = Utils.get_service_endpoint(config, 'process')
self.assertTrue(endpoint == 'http://localhost')
def test_use_hardlinks_config(self):
    """
    Check use_hardlinks: off by default, can be switched on by a bank
    file or by the global configuration.
    """
    option = "use_hardlinks"
    BiomajConfig.load_config(self.utils.global_properties,
                             allow_user_config=False)
    # local.properties: expected to be disabled
    enabled_for_local = BiomajConfig('local').get_bool(option)
    self.assertFalse(enabled_for_local)
    # hardlinks.properties: expected to be enabled (override)
    enabled_for_hardlinks = BiomajConfig('hardlinks').get_bool(option)
    self.assertTrue(enabled_for_hardlinks)
    # Load the global file that sets use_hardlinks=1 and check the
    # 'local' bank again
    BiomajConfig.load_config(self.utils.global_properties_hl,
                             allow_user_config=False)
    enabled_globally = BiomajConfig('local').get_bool(option)
    self.assertTrue(enabled_globally)
def test_mimes(self):
fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'bank/test2.fasta')
......@@ -186,6 +235,39 @@ class TestBiomajUtils(unittest.TestCase):
Utils.copy_files_with_regexp(from_dir, to_dir, ['.*\.py'])
self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
def test_copy_with_regexp_hardlink(self):
    """
    Test copy with hardlinks: we create files in data_dir and try to
    link them. This should work unless /tmp doesn't accept hardlinks.
    """
    # Create 5 files and a directory in data_dir. We don't destroy them
    # since they are in /tmp.
    suffix = ".dat"
    regexp = ".*\\" + suffix
    orig_file_full = []
    for _ in range(5):
        fd, path = tempfile.mkstemp(dir=self.utils.data_dir, suffix=suffix)
        # mkstemp returns an open descriptor; only the path is needed,
        # so close it right away instead of leaking it.
        os.close(fd)
        orig_file_full.append(path)
    to_dir = tempfile.mkdtemp(dir=self.utils.data_dir)
    new_file_full = [os.path.join(to_dir, os.path.basename(f))
                     for f in orig_file_full]
    # Copy
    from_dir = self.utils.data_dir
    Utils.copy_files_with_regexp(from_dir, to_dir, [regexp],
                                 use_hardlinks=True)
    # Check that the files were copied
    for orig, new in zip(orig_file_full, new_file_full):
        self.assertTrue(os.path.exists(new))
        # A hard link shares its inode with the source file. A regular
        # copy is accepted as well, so a mismatch is only reported.
        if os.stat(orig).st_ino != os.stat(new).st_ino:
            msg = "In %s: copy worked but hardlinks were not used." % self.id()
            print(msg, file=sys.stderr)
def test_copy(self):
from_dir = os.path.dirname(os.path.realpath(__file__))
local_file = 'biomaj_tests.py'
......@@ -194,6 +276,33 @@ class TestBiomajUtils(unittest.TestCase):
Utils.copy_files(files_to_copy, to_dir)
self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
def test_copy_hardlink(self):
    """
    Test copy with hardlinks: we create a file in data_dir and try to
    link it. This should work unless /tmp doesn't accept hardlinks.
    """
    # Create a file and a directory in data_dir. We don't destroy them
    # since they are in /tmp.
    fd, orig_file_full = tempfile.mkstemp(dir=self.utils.data_dir)
    # mkstemp returns an open descriptor; only the path is needed, so
    # close it right away instead of leaking it.
    os.close(fd)
    orig_file = os.path.basename(orig_file_full)
    to_dir = tempfile.mkdtemp(dir=self.utils.data_dir)
    new_file_full = os.path.join(to_dir, orig_file)
    # Copy
    from_dir = self.utils.data_dir
    files_to_copy = [{'root': from_dir, 'name': orig_file}]
    Utils.copy_files(files_to_copy, to_dir, use_hardlinks=True)
    # Check that the file was copied
    self.assertTrue(os.path.exists(new_file_full))
    # A hard link shares its inode with the source file. A regular copy
    # is accepted as well, so a mismatch is only reported.
    if os.stat(orig_file_full).st_ino != os.stat(new_file_full).st_ino:
        msg = "In %s: copy worked but hardlinks were not used." % self.id()
        print(msg, file=sys.stderr)
@attr('check')
def test_check_method(self):
"""Check .name, .exe and .args are well checked during bank configuration
......
[GENERAL]
test=1
conf.dir=/tmp/biomaj/config
log.dir=/tmp/biomaj/log
process.dir=/tmp/biomaj/process
#The root directory where all databases are stored.
#If your data is not stored under one directory hierarchy
#you can override this value in the database properties file.
data.dir=/tmp/biomaj/
lock.dir=/tmp/biomaj/lock
cache.dir=/tmp/biomaj/cache
use_hardlinks=1
db.url=mongodb://localhost:27017
db.name=biomaj_test
use_ldap=1
ldap.host=localhost
ldap.port=389
ldap.dn=nodomain
# Use ElasticSearch for index/search capabilities
use_elastic=0
#Comma separated list of elasticsearch nodes host1,host2:port2
elastic_nodes=localhost
elastic_index=biomaj_test
celery.queue=biomaj
celery.broker=mongodb://localhost:27017/biomaj_celery
# Get directory stats (can be time consuming depending on number of files etc...)
data.stats=1
# List of user admin (linux user id, comma separated)
admin=
# Auto publish on updates (do not need publish flag, can be overridden in bank property file)
auto_publish=0
########################
# Global properties file
#To override these settings for a specific database go to its
#properties file and uncomment or add the specific line you want
#to override.
#----------------
# Mail Configuration
#---------------
#Uncomment these lines if you want to receive mail when the workflow is finished
mail.smtp.host=
mail.admin=
mail.from=
#---------------------
#Proxy authentication
#---------------------
#proxyHost=
#proxyPort=
#proxyUser=
#proxyPassword=
#Number of threads for processes
bank.num.threads=2
#Number of threads to use for downloading
files.num.threads=4
#to keep more than one release increase this value
keep.old.version=0
#----------------------
# Release configuration
#----------------------
release.separator=_
#The historic log file is generated in log/
#define level information for output : DEBUG,INFO,WARN,ERR
historic.logfile.level=DEBUG
#http.parse.dir.line=<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
http.parse.dir.line=<img[\s]+src="[\S]+"[\s]+alt="\[DIR\]"[\s]*/?>[\s]*<a[\s]+href="([\S]+)/"[\s]*>.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
http.parse.file.line=<img[\s]+src="[\S]+"[\s]+alt="\[[\s]+\]"[\s]*/?>[\s]<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
http.group.dir.name=1
http.group.dir.date=2
http.group.file.name=1
http.group.file.date=2
http.group.file.size=3
# Bank default access
visibility.default=public
[loggers]
keys = root, biomaj
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = INFO
handlers = console
[logger_biomaj]
level = DEBUG
handlers = console
qualname = biomaj
propagate=0
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = DEBUG
formatter = generic
[formatter_generic]
format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
[GENERAL]
######################
### Initialization ###
db.fullname="local system bank test with hardlinks"
db.name=local
db.type=nucleic_protein
offline.dir.name=offline/test/local_tmp
dir.version=test/local
frequency.update=0
use_hardlinks=1
### Synchronization ###
files.num.threads=1
# NCBI (download fasta)
protocol=local
server=
release.file=
release.regexp=
release.file.compressed=
remote.dir=/tmp/
remote.files=^test.*
#Uncomment if you don't want to extract the data files.
#no.extract=true
local.files=^test.*
## Post Process ## The files should be located in the projectfiles/process directory
db.post.process=
### Deployment ###
keep.old.version=1