Commit ab5e237f authored by BODERE

Merge branch 'hotfix_6.0.7' into 'master'

Hotfix 6.0.7

Closes #57

See merge request downloader/downloader_daemon!33
parents e72d65c2 76d9982d
......@@ -77,10 +77,12 @@ def spliturl(url, defaultpath='/'):
    if path == '':
        path = '/'
    # remove the trailing '/'
    while len(path) > 1 and path[-1] == '/':  # > 1 because we don't want to remove the '/' root directory
        path = path[:-1]
    if scheme != 'https':
        # remove the trailing '/'
        while len(path) > 1 and path[-1] == '/':  # > 1 because we don't want to remove the '/' root directory
            path = path[:-1]
    return (scheme, domain, path, port, username, password)
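A minimal before/after sketch of the change (hypothetical URLs, return order as in the line above): the trailing '/' is now stripped only for non-https schemes.

    # spliturl('ftp://host/data/dir/')   -> path == '/data/dir'   (trailing '/' stripped)
    # spliturl('https://host/data/dir/') -> path == '/data/dir/'  (trailing '/' kept)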
......@@ -94,9 +96,10 @@ def getProtocolName(scheme, path, domain, exitonfail=True):
        return 'http'
    elif scheme == 'https':
        if (domain == "oceandata.sci.gsfc.nasa.gov"):
            return "https_filetable"
            # return "https_filetable"
            return "https_directorylist"
        else:
            return "http_directorylist"
            return "https_directorylist"
    elif scheme == 'file' \
            or (scheme == '' and path.startswith('/')) \
            or (scheme == '' and path.startswith('./')):
......
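With this hunk, every https URL is now dispatched to the https_directorylist protocol: both the oceandata special case and the former else branch resolve to the same name. Illustrative calls (hypothetical arguments, signature from the hunk header):

    # getProtocolName('https', '/', 'oceandata.sci.gsfc.nasa.gov') -> 'https_directorylist'
    # getProtocolName('https', '/', 'some.other.host')             -> 'https_directorylist'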
......@@ -12,7 +12,7 @@ from dchecktools.protocols import localpath
from dchecktools.protocols import http_ostia
from dchecktools.protocols import http_navy_rainglobal
from dchecktools.protocols import http_jma_jra_gribfinal
from dchecktools.protocols import http_directorylist
from dchecktools.protocols import https_directorylist
from dchecktools.protocols import ftp
from dchecktools.protocols import lslr_file
from dchecktools.protocols import http_stdmet
......@@ -20,6 +20,7 @@ from dchecktools.protocols import https_opensearch
from dchecktools.protocols import sftp
from dchecktools.protocols import webdav
from dchecktools.protocols import ftps
from dchecktools.protocols import file_extractor
from dchecktools.filters.StringFilters import FileFilter
from dchecktools.common.errors import DC_Error, DC_ConfigError, DC_DbError
from dchecktools.plugins import DataReaderFactory
......@@ -64,6 +65,7 @@ class Config:
        # Protocol parameters
        self.protocol = None
        self.protocol_option = None
        self.path = None
        self.server_address = None
        self.server_port = None
......@@ -149,6 +151,8 @@ class Config:
        # Protocol parameters
        if 'PROTOCOL' in module_attr:
            self.protocol = module.PROTOCOL
        if 'PROTOCOL_OPTION' in module_attr:
            self.protocol_option = module.PROTOCOL_OPTION
        if 'PATH' in module_attr:
            self.path = module.PATH
        if 'SERVER_ADDRESS' in module_attr:
......@@ -295,6 +299,7 @@ class Config:
cfg_str += "</database>"
cfg_str += "<protocol>"
cfg_str += self.getNodeString('protocol')
cfg_str += self.getNodeString('protocol_option')
cfg_str += self.getNodeString('path')
cfg_str += self.getNodeString('server_address')
cfg_str += self.getNodeString('server_port')
......@@ -349,7 +354,7 @@ class Config:
class DCheck(object):
    def getProtocolObject(self, protocolname):
    def getProtocolObject(self, protocolname, option=None):
        obj = None
        if protocolname == "localpath":
            obj = localpath.Protocol_localpath()
......@@ -363,11 +368,20 @@ class DCheck(object):
            obj = http_navy_rainglobal.Protocol_http_navy_rainglobal()
        elif protocolname == "http_jma_jra_gribfinal":
            obj = http_jma_jra_gribfinal.Protocol_http_jma_jra_gribfinal()
        elif protocolname == "http_directorylist":
            obj = http_directorylist.Protocol_http_directorylist()
        # 27/03/2018 PMT#37
        # elif protocolname == 'https_filetable':
        #     obj = http_directorylist.Protocol_https_filetable()
        elif protocolname == "https_directorylist":
            classFileExtractor = https_directorylist.FileExtractor
            if option is not None and "FileExtractor" in option:  # guard: option defaults to None
                try:
                    module = file_extractor
                except NameError:
                    # module to load from the plugins directory,
                    # a directory added to sys.path in loadFromModule().
                    module = __import__("https_FileExtractor_" + option["FileExtractor"])
                    log.warning("The HTTPS protocol uses a FileExtractor module from the plugins directory")
                classFileExtractor = getattr(module,
                                             "FileExtractor_" + option["FileExtractor"],
                                             https_directorylist.FileExtractor)
            obj = https_directorylist.Protocol_https_directorylist(classFileExtractor)
        elif protocolname == "lslr_file":
            obj = lslr_file.Protocol_lslr_file()
        elif protocolname == "https_opensearch":
......@@ -377,7 +391,7 @@ class DCheck(object):
elif protocolname == "webdav":
obj = webdav.Protocol_webdav()
elif protocolname[:4] == "ftps":
obj = ftps.Protocol_ftps(protocolname[5:])
obj = ftps.Protocol_ftps(option)
return obj
def getOptions(self, argsList=None):
......@@ -531,8 +545,7 @@ class DCheck(object):
        log.setLevel(config.logLevel)
        log.debug("Config : " + config.constConfigString())
        proto = self.getProtocolObject(config.protocol)
        proto = self.getProtocolObject(config.protocol, config.protocol_option)
        if proto is None:
            raise DC_ConfigError("Unknown protocol : %s" % (config.protocol))
......
......@@ -92,7 +92,7 @@ class FileFilter(object):
            return False
        # check force file regexp
        for regexp in self.__forceRegexp:
            if regexp.search(os.path.join(dirpath, dirname)):
            if regexp.search(dirpath):
                return True
        return self.__interestingByDefault
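This pairs with the walker changes further down: callers now pass the already-joined path as dirpath, so the force regexp is applied to the full directory path exactly once instead of being joined here. Illustrative effect (hypothetical values):

    # before: isInterestingDirectory(dirname='2018', dirpath='/data')      searched '/data/2018'
    # after:  isInterestingDirectory(dirname='2018', dirpath='/data/2018') searches '/data/2018'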
import os
import re
import html.parser

from dchecktools.protocols.https_directorylist import Protocol_https_directorylist, getNumericSize


class FileExtractor_UniBremen(html.parser.HTMLParser):
    DATE_FORMAT = "%Y-%m-%d %H:%M"
    FACTOR = {'': 1,
              'K': 1 << 10,  # 1024
              'M': 1 << 20,
              'G': 1 << 30,
              'T': 1 << 40,
              'P': 1 << 50}

    @staticmethod
    def get_download_url(dcheck_url):
        return dcheck_url

    @staticmethod
    def get_new_dcheck_url(old_dcheck_url, path):
        return old_dcheck_url + '/' + path

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.links = {}
        self._to_ignore = False
        self._is_directory = False
        self._is_file = False
        self._current_name = ''
        self.__mtime_pattern = re.compile(' *([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}) *')
        self.__size_pattern = re.compile(' *([0-9a-zA-Z.-]*) *')
        self.__td_count = 0
        self.__mtime_str = ""

    def handle_data(self, data):
        size_str = None
        # print(f"handle_data : {self._current_name} :{data} (count = {self.__td_count})")
        if not self._to_ignore:
            if len(data) > 0 and self._current_name != '':
                if self.__td_count == 3:
                    mg = self.__mtime_pattern.match(data)
                    if mg is not None:
                        self.__mtime_str = mg.groups()[0]
                elif self.__td_count == 4:
                    if self._is_file:
                        isdirectory = 0
                        mg = self.__size_pattern.match(data)
                        if mg is not None:
                            size_str = mg.groups()[0]
                        self.links[self._current_name] = \
                            (isdirectory, self.__mtime_str, size_str)
                        # print("handle_data =====> FILE name : %s, mtime : %s, size : %s" % (self._current_name, self.__mtime_str, size_str))
                    else:
                        isdirectory = 1
                        size_str = "0"
                        self.links[self._current_name] = (isdirectory, self.__mtime_str, size_str)
                        # print("handle_data =====> DIR name : %s, mtime : %s" % (self._current_name, self.__mtime_str))

    def handle_starttag(self, tag, attrs):
        # print(f"handle_starttag =====> {tag} = {attrs} (count = {self.__td_count})")
        if tag == "img" and self.__td_count == 1:
            if len(attrs) > 0:
                for attr in attrs:
                    if attr[0] == 'alt':
                        if attr[1] == '[DIR]':
                            self._to_ignore = False
                            self._is_directory = True
                            self._is_file = False
                        elif not attr[1].startswith('['):
                            self._to_ignore = True
                            self._is_directory = False
                            self._is_file = False
                        elif attr[1] == '[PARENTDIR]':
                            self._to_ignore = True
                            self._is_directory = True
                            self._is_file = False
                        else:
                            self._is_directory = False
                            self._is_file = True
                            self._to_ignore = False
        elif tag == "a" and self.__td_count == 2:
            if not self._to_ignore:
                if len(attrs) > 0:
                    for attr in attrs:
                        if attr[0] == "href":
                            if attr[1][0:1] == '/':
                                continue
                            self._current_name = attr[1].rstrip('/')
                            if self._is_directory:
                                self.links[self._current_name] = list()
                            if self._is_file:
                                self.links[self._current_name] = list()
        elif tag == "tr":
            self.__td_count = 0
            self.__mtime_str = None
        elif tag == "td":
            self.__td_count += 1
        # print(f"handle_starttag =====> ({self._is_directory}, {self._is_file}, {self._to_ignore})")

    def handle_endtag(self, tag):
        if tag == "tr":
            if self._is_directory:
                self._is_directory = False
        elif tag == "td":
            if self.__td_count == 4:
                self.__td_count = 0

    def get_files(self):
        return self.links

    def set_mtime_pattern(self, pattern):
        self.__mtime_pattern = re.compile(pattern)

    def set_size_pattern(self, pattern):
        self.__size_pattern = re.compile(pattern)
class FileExtractor_Nomads(FileExtractor_UniBremen):
    DATE_FORMAT = "%d-%b-%Y %H:%M"

    def __init__(self):
        super().__init__()
        self.set_mtime_pattern(' *([0-9]{2}-[a-zA-Z]{3}-[0-9]{4} [0-9]{2}:[0-9]{2}) *')


class FileExtractor_UniHamburg(html.parser.HTMLParser):  # thredds
    DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
    FACTOR = {'bytes': 1,
              'Kbytes': 1E3,  # decimal (SI) multiples
              'Mbytes': 1E6,
              'Gbytes': 1E9,
              'Tbytes': 1E12,
              'Pbytes': 1E15}

    @staticmethod
    def get_download_url(dcheck_url):
        url = os.path.split(dcheck_url)[0].replace('catalog', 'fileServer')
        return url

    @staticmethod
    def get_new_dcheck_url(old_dcheck_url, path):
        url = os.path.split(old_dcheck_url)
        return url[0] + '/' + path.rstrip('/') + '/' + url[1]

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.links = {}
        self._to_ignore = False
        self._is_file = False
        self._current_name = ''
        self.__mtime_pattern = re.compile(' *([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z) *')
        self.__size_pattern = re.compile(' *([0-9.]* [a-zA-Z]*) *')
        self.__td_count = 0
        self.__size_str = None
        self.__is_value = False

    def handle_data(self, data):
        mtime_str = ""
        # print(f"handle_data : {self._current_name} :{data} (count = {self.__td_count})")
        if not self._to_ignore and self.__is_value and len(data) > 0:
            if self._current_name == '' and self.__td_count == 1:
                self._current_name = data
            if self._current_name != '':
                if self.__td_count == 2:
                    if self._is_file:
                        mg = self.__size_pattern.match(data)
                        if mg is not None:
                            self.__size_str = mg.groups()[0]
                    else:
                        self.__size_str = "0"
                elif self.__td_count == 3:
                    mg = self.__mtime_pattern.match(data)
                    if mg is not None:
                        mtime_str = mg.groups()[0]
                    if self._is_file:
                        isdirectory = 0
                        self.links[self._current_name] = \
                            (isdirectory, mtime_str, self.__size_str)
                        # print("handle_data =====> FILE name : %s, mtime : %s, size : %s" % (self._current_name, mtime_str, self.__size_str))
                    else:
                        # directory names end with '/'; anything else is skipped here
                        if self._current_name[-1:] == '/':
                            isdirectory = 1
                            self.links[self._current_name] = (isdirectory, mtime_str, self.__size_str)
                            # print("handle_data =====> DIR name : %s, mtime : %s" % (self._current_name, mtime_str))

    def handle_starttag(self, tag, attrs):
        # print(f"handle_starttag =====> {tag} = {attrs} (count = {self.__td_count})")
        if tag == "img" and self.__td_count == 1:
            if len(attrs) > 0:
                for attr in attrs:
                    if attr[0] == 'alt':
                        if attr[1] == '[DIR]' or attr[1] == 'Folder':
                            self._to_ignore = False
                            self._is_file = False
                        elif attr[1] == '[PARENTDIR]' or not attr[1].startswith('['):
                            self._to_ignore = True
                            self._is_file = False
                        else:
                            self._is_file = True
                            self._to_ignore = False
        elif tag == "tr":
            self.__td_count = 0
            self._to_ignore = False
            self.__mtime_str = None
            self._is_file = True
            self._current_name = ''
        elif tag == "td":
            self.__td_count += 1
        elif tag == "tt":
            self.__is_value = True
        # print(f"handle_starttag =====> ({self._is_file}, {self._to_ignore})")

    def handle_endtag(self, tag):
        if tag == "tr":
            self._to_ignore = True
        elif tag == "tt":
            self.__is_value = False

    def get_files(self):
        return self.links

    def set_mtime_pattern(self, pattern):
        self.__mtime_pattern = re.compile(pattern)

    def set_size_pattern(self, pattern):
        self.__size_pattern = re.compile(pattern)
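A hedged smoke test of the UniBremen extractor on a minimal Apache-style listing (hypothetical HTML; the expected result is inferred from the parser logic above, one (isdirectory, mtime, size) tuple per entry):

    parser = FileExtractor_UniBremen()
    parser.feed(
        '<tr><td><img alt="[DIR]"></td><td><a href="n6250/">n6250/</a></td>'
        '<td>2021-03-01 12:00</td><td>  - </td></tr>'
        '<tr><td><img alt="[   ]"></td><td><a href="file.nc">file.nc</a></td>'
        '<td>2021-03-01 12:05</td><td>1.2M</td></tr>')
    print(parser.get_files())
    # expected: {'n6250': (1, '2021-03-01 12:00', '0'),
    #            'file.nc': (0, '2021-03-01 12:05', '1.2M')}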
......@@ -92,7 +92,7 @@ def ftpwalk(
        # Filter directory
        if directoryFilter is not None:
            interestingDirectory = directoryFilter.isInterestingDirectory(
                dirname=dname, mtime=mtime, dirpath=top)
                dirname=dname, mtime=mtime, dirpath=os.path.join(top, dname))
            log.debug(
                "directory %s is interesting ? %s" %
                (path, interestingDirectory))
......@@ -476,7 +476,7 @@ class Protocol_ftp(AbstractProtocol):
            dirpath = dirname[0]
            mtime = dirname[2]
            interestingDir = self.directoryFilter.isInterestingDirectory(
                dirname=dirpath, mtime=mtime, dirpath=basedir)
                dirname=dirpath, mtime=mtime, dirpath=os.path.join(basedir, dirpath))
            if not interestingDir:
                continue
......
......@@ -92,7 +92,7 @@ def ftpswalk(
        # Filter directory
        if directoryFilter is not None:
            interestingDirectory = directoryFilter.isInterestingDirectory(
                dirname=dname, mtime=mtime, dirpath=top)
                dirname=dname, mtime=mtime, dirpath=os.path.join(top, dname))
            log.debug(
                "directory %s is interesting ? %s" %
                (path, interestingDirectory))
......@@ -477,7 +477,7 @@ class Protocol_ftps(AbstractProtocol):
            dirpath = dirname[0]
            mtime = dirname[2]
            interestingDir = self.directoryFilter.isInterestingDirectory(
                dirname=dirpath, mtime=mtime, dirpath=basedir)
                dirname=dirpath, mtime=mtime, dirpath=os.path.join(basedir, dirpath))
            if not interestingDir:
                continue
......
......@@ -75,7 +75,7 @@ class Protocol_localpath(AbstractProtocol):
            if dirpath != path and \
                    not self.directoryFilter.isInterestingDirectory(dirname=current_directory[1],
                                                                    dirpath=current_directory[0]):
                                                                    dirpath=current_directory):
                log.debug("memorization of the excluded path : %s" % (dirpath))
                excludeDirectory = dirpath
                continue
......
......@@ -87,7 +87,7 @@ def sftpwalk(
        # Filter directory
        if directoryFilter is not None:
            interestingDirectory = directoryFilter.isInterestingDirectory(
                dirname=dname, mtime=mtime, dirpath=top)
                dirname=dname, mtime=mtime, dirpath=os.path.join(top, dname))
            log.debug("directory %s is interesting ? %s" %
                      (path, interestingDirectory))
            if not interestingDirectory:
......@@ -315,7 +315,7 @@ class Protocol_sftp(AbstractProtocol):
            dirpath = dirname[0]
            mtime = dirname[2]
            interestingDir = self.directoryFilter.isInterestingDirectory(
                dirname=dirpath, mtime=mtime, dirpath=basedir)
                dirname=dirpath, mtime=mtime, dirpath=os.path.join(basedir, dirpath))
            if not interestingDir:
                continue
......
......@@ -37,7 +37,7 @@ def webdavwalk(isSmartCrawler, directoryFilter, webdav, baseurl, root,
    Generator that yields tuples of (root, dirs, nondirs).
    """
    # Set the current directory to the top dir.
    log.info('webdavwalk : dir=%s' % (root))
    # log.info('webdavwalk : dir=%s' % (root))
    try:
        try:
......@@ -60,7 +60,7 @@ def webdavwalk(isSmartCrawler, directoryFilter, webdav, baseurl, root,
        if directoryFilter is not None:
            currentDirectory = os.path.split(dname.rstrip('/'))
            interestingDirectory = directoryFilter.isInterestingDirectory(
                dirname=currentDirectory[1], mtime=mtime, dirpath=currentDirectory[0])
                dirname=currentDirectory[1], mtime=mtime, dirpath=currentDirectory)
            log.debug("directory %s is interesting ? %s" % (path, interestingDirectory))
            if not interestingDirectory:
                continue
......@@ -282,7 +282,7 @@ class Protocol_webdav(AbstractProtocol):
            mtime = dirname[2]
            currentDirectory = os.path.split(dirpath.rstrip('/'))
            interestingDirectory = self.directoryFilter.isInterestingDirectory(
                dirname=currentDirectory[1], mtime=mtime, dirpath=currentDirectory[0])
                dirname=currentDirectory[1], mtime=mtime, dirpath=currentDirectory)
            if not interestingDirectory:
                continue
......
......@@ -98,4 +98,4 @@ class EMMWriter(IMessageWriter):
        self.__emm_rabbitmq.writeMessage(dataset_id, message)

    def setProjectName(self, project_name):
        self.projet_name = project_name
\ No newline at end of file
        self.projet_name = project_name
......@@ -62,7 +62,8 @@ class File(object):
BZIP2 = "bz2"
ZIP = "zip"
LZW = "z"
LIST_COMPRESS = [ZIP, GZIP, BZIP2, LZW]
TGZ = "tgz"
LIST_COMPRESS = [ZIP, GZIP, BZIP2, LZW, TGZ]
# Digest algorithm
MD5 = "md5"
......@@ -141,7 +142,7 @@ class File(object):
            raise ValueError(
                "bad `compress`, value must be in File.LIST_COMPRESS")
        if compress == File.GZIP:
        if compress in (File.GZIP, File.TGZ):
            opener = GzipFile
            validMode = ("r", "a", "w", "rb", "ab", "wb")
            args = {'filename': self.getName(), 'mode': mode, 'compresslevel': compressLevel}
......
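Note that GzipFile only removes the gzip layer of a .tgz, so reads through this opener return the raw tar stream. A short sketch (standard library, hypothetical file name) for the cases where the tar layer matters as well:

    import tarfile

    # tarfile deals with both the gzip and the tar layer in one step
    with tarfile.open('archive.tgz', mode='r:gz') as tar:
        print(tar.getnames())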
......@@ -85,6 +85,8 @@ class Folder(object):
        # to force the use of hosts by input_spool instead of by node
        self.possible_host = []
        self.nfs_check_error = False

        # Update the content of the folder
        if scan is True:
            try:
......@@ -303,9 +305,14 @@ class Folder(object):
    # Check if the network folder is accessible (in case of NFS mounts)
    def __nfsCheck(self, timeout=0.5):
        from ifr_lib.ifr_os import is_io_locked
        self.nfs_check_error = False
        try:
            if is_io_locked(self.path, timeout):
                self.nfs_check_error = True
                raise IOError('Folder::nfs_check: server not responding')
        except Exception as e:
            raise IOError(str(e))
        if is_io_locked(self.path):
            raise IOError('Folder::nfs_check: server not responding')
    # Update Folder content from FileSystem
    def scan(self):
......@@ -323,7 +330,7 @@ class Folder(object):
        # check that the directory is available, to avoid NFS freezes for example. The probeDir program
        # always exits shortly (it does not wait for the folder to become available again when NFS problems occur)
        if self.__probing:
            self.__nfsCheck()
            self.__nfsCheck(2.0)

        for key, isFile in self._walkInFolder():
            if isFile:
......
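The probe above relies on ifr_lib.ifr_os.is_io_locked, whose implementation is not shown in this diff. A minimal sketch of the pattern it presumably follows (hypothetical, standard library only): run the blocking stat() in a worker thread and treat a timeout as a frozen mount.

    import os
    from concurrent.futures import ThreadPoolExecutor
    from concurrent.futures import TimeoutError as FutureTimeout

    def is_io_locked_sketch(path, timeout=0.5):
        """Return True if stat() on `path` gives no answer within `timeout` seconds."""
        pool = ThreadPoolExecutor(max_workers=1)
        future = pool.submit(os.stat, path)
        try:
            future.result(timeout=timeout)
            return False
        except FutureTimeout:
            return True
        finally:
            pool.shutdown(wait=False)  # never join a thread stuck on a frozen mount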
......@@ -120,15 +120,21 @@ class DCheckReportListingBuilder(IListingBuilder):
                self.__download.remote_storage_repository.lstrip('/'))
        elif provider_type == 'HttpsProvider':
            # 2018/12/18 PMT#14 protocol type in lowercase
            if self.__download.remoteStorageProviderType == 'Https_OpenSearch' or self.__download.remoteStorageProviderType == 'Https_opensearch':
            if self.__download.remoteStorageProviderType == 'Https_OpenSearch' \
                    or self.__download.remoteStorageProviderType == 'Https_opensearch':
                protocol = "https_opensearch"
            else:
                protocol = "https"
            self.__base_url = '%s://%s:%s@%s/%s' % (protocol,
                                                    provider.username,
                                                    provider.password,
                                                    provider.server,
                                                    self.__download.remote_storage_repository.lstrip('/'))
            if provider.username is not None and provider.password is not None:
                self.__base_url = '%s://%s:%s@%s/%s' % (protocol,
                                                        provider.username,
                                                        provider.password,
                                                        provider.server,
                                                        self.__download.remote_storage_repository.lstrip('/'))
            else:
                self.__base_url = f"{protocol}://{provider.server}/" \
                                  f"{self.__download.remote_storage_repository.lstrip('/')}"
        # 06/04/2018 PMT add LocalmoveProvider
        elif provider_type in ['LocalpathProvider', 'LocalpointerProvider', 'OnlynotifyProvider', 'LocalmoveProvider']:
            self.__base_url = self.__download.remote_storage_repository
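Illustrative base URLs produced by the https branch above (hypothetical provider values):

    # with credentials:    https://user:secret@data.example.org/products/sst
    # without credentials: https://data.example.org/products/sst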
......@@ -310,6 +316,12 @@ class DCheckReportListingBuilder(IListingBuilder):
                self.__download.configuration.source.opensearch_requestFormat),
            "RESULT_FORMAT",
            "")
        self.__writeLineDCheckFileConfig(
            configFile,
            self.__makeDictParameter(
                self.__download.configuration.source.protocol_option),
            "PROTOCOL_OPTION",
            "")
        configFile.close()
......@@ -333,6 +345,16 @@ class DCheckReportListingBuilder(IListingBuilder):
returnValue = "['" + value + "']"
return returnValue
def __makeDictParameter(self, value):
returnValue = "{"
line = ""
for key in value.keys():
line += f"'{key}': '{value[key]}',"
if line != "":
returnValue += line[:-1]
returnValue += '}'
return returnValue
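A short illustration of the serialization (hypothetical values): the dict is written as a Python literal into the generated DCheck config file.

    # __makeDictParameter({'FileExtractor': 'UniBremen'}) -> "{'FileExtractor': 'UniBremen'}"
    # __makeDictParameter({})                             -> '{}'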
    def __makeBooleanParameter(self, value):
        returnValue = value
        if value:
......
......@@ -5,11 +5,18 @@
# Log file management
import logging
from string import capwords
import json
from ifr_lib.ifr_yaml import YamlConfig