Commit 64f74175 authored by MAGHOUZ

#96 download files using https listing file

parent 9bdfc0be
@@ -8,7 +8,7 @@ from eo_dataflow_manager.dchecktools.common.basefileinfos import SqliteBaseFiles
from eo_dataflow_manager.dchecktools.common.spliturl import setConfigFromUrl
from eo_dataflow_manager.dchecktools.common import workingdir
-from eo_dataflow_manager.dchecktools.protocols import localpath
+from eo_dataflow_manager.dchecktools.protocols import localpath, https_listing_file
from eo_dataflow_manager.dchecktools.protocols import https_directorylist
from eo_dataflow_manager.dchecktools.protocols import ftp
from eo_dataflow_manager.dchecktools.protocols import lslr_file
@@ -374,6 +374,11 @@ class DCheck(object):
obj = lslr_file.Protocol_lslr_file()
elif protocolname == "https_opensearch":
obj = https_opensearch.Protocol_https_opensearch()
elif protocolname == "https_listing_file":
data_server_address = ""
if "DataServerAddress" in option:
data_server_address = option["DataServerAddress"]
obj = https_listing_file.Protocol_https_listing_file(data_server_address)
elif protocolname == "sftp":
obj = sftp.Protocol_sftp()
elif protocolname == "webdav":
......
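For reference, a minimal sketch of how this new branch gets used; the select_protocol wrapper and the option values below are hypothetical, only the "https_listing_file" case mirrors the diff above:

# Hypothetical harness around the dispatch added above. The optional
# "DataServerAddress" lets files be downloaded from a different host than
# the one serving the listing file; when empty, setConfig() falls back to
# the listing server's address.
from eo_dataflow_manager.dchecktools.protocols import https_listing_file

def select_protocol(protocolname, option):
    if protocolname == "https_listing_file":
        data_server_address = option.get("DataServerAddress", "")
        return https_listing_file.Protocol_https_listing_file(data_server_address)
    raise ValueError("unsupported protocol: %s" % protocolname)

obj = select_protocol("https_listing_file",
                      {"DataServerAddress": "data.example.org"})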
import os
import re
from eo_dataflow_manager.dchecktools.protocols.AbstractProtocol import AbstractProtocol
from eo_dataflow_manager.dchecktools.common.basefileinfos import File, FILE_URL_SEPARATOR
from eo_dataflow_manager.dchecktools.common.errors import DC_ConfigError, DC_UrlError
import urllib.request
import urllib.error
import urllib.parse
import logging
import socket
import time
from datetime import datetime
FILE_FORMAT_REGEXP = "([rwxdblcpsStT-]{10}). +([0-9]+) +(.*?) +(.*?) +([0-9]*?) +([a-zA-Z]{3} +[0-9]* [0-9]*:[0-9]*) (.*)\n?"
# rights links user group size date (ex: Sep 17 11:04) name
FILE_FORMAT_REGEXP2 = "([rwxdblcpsStT-]{10}). +([0-9]+) +(.*?) +(.*?) +([0-9]*?) +([a-zA-Z]{3} +[0-9]* +[0-9]*) (.*)\n?"
# rights links user group size date (ex: Sep 17 2019) name
DIR_FORMAT_REGEXP = "(.*):\n"
LNK_FORMAT_REGEXP = "(.*) -> (.*)"
lnk_regexp = re.compile(LNK_FORMAT_REGEXP)
DEFAULT_SERVER_PORT = 443
DEFAULT_USERNAME = ""
DEFAULT_PASSWORD = ""
log = logging.getLogger('https_listing_file')
log.setLevel(logging.DEBUG)
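# Quick illustration (not part of the original module): each ls-style line of
# the listing is matched by one of the two patterns above, e.g.
#   re.match(FILE_FORMAT_REGEXP2,
#            "-rw-r--r--. 1 atcfcm atcf 1320 Mar 15 2021 bsh992021.210315.dat").groups()
# yields ('-rw-r--r--', '1', 'atcfcm', 'atcf', '1320', 'Mar 15 2021',
#         'bsh992021.210315.dat'), while FILE_FORMAT_REGEXP covers the
# "Mar 18 18:25" date style used for recent entries.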
class Protocol_https_listing_file(AbstractProtocol):
def __init__(self, data_server_address=""):
AbstractProtocol.__init__(self)
# default socket timeout
socket.setdefaulttimeout(self.getDefaultTimeout())
# default config
self.port = DEFAULT_SERVER_PORT
self.username = DEFAULT_USERNAME
self.password = DEFAULT_PASSWORD
# required config
self.path = None
self.server_address = None
self.data_server_address = data_server_address
self.check_infos = ['size', 'mtime']
self._checkable_infos = [
'size',
'mtime']
self.__current_path = None
def setConfig(self, config):
AbstractProtocol.setConfig(self, config)
# overwrite default config
if config.server_port is not None:
self.port = config.server_port
if config.auth_username != "" and config.auth_username is not None:
self.username = config.auth_username
if config.auth_password != "" and config.auth_password is not None:
self.password = config.auth_password
# check for required config
if config.path is not None:
self.path = config.path
else:
raise DC_ConfigError("Configuration error : missing path")
if config.server_address is not None:
self.server_address = config.server_address
else:
raise DC_ConfigError(
"Configuration error: missing server_address")
# add default values if needed
if config.check_infos is None:
config.check_infos = self.check_infos
if self.config.logLevel:
log.setLevel(self.config.logLevel)
if self.data_server_address == "":
self.data_server_address = self.server_address
def setDirectoryFilter(self, directoryFilter):
self.directoryFilter = directoryFilter
def setFileFilter(self, fileFilter):
self.fileFilter = fileFilter
def getFileInfoList(self):
url = self.server_address.rstrip('/') + '/' + self.path.lstrip('/')
if not url.startswith("http"):
url = 'https://' + url
data_server_address = self.data_server_address.rstrip('/')
if not data_server_address.startswith("http"):
data_server_address = 'https://' + data_server_address + "/"
try:
files = self.https_get_files_from_listing_file(
url=url,
data_server_address=data_server_address,
timeout=self.getDefaultTimeout()
)
files_to_insert = []
try:
for filename in files:
filepath = filename[0].split(FILE_URL_SEPARATOR)[0]
# the listing mixes two ls date styles: "Mar 15 2021" and "Mar 18 18:25"
try:
mtime = time.mktime(
time.strptime(
filename[2],
"%b %d %Y"))
except ValueError:
# no year in the "%b %d %H:%M" style; strptime defaults it to 1900
mtime = time.mktime(
time.strptime(
filename[2],
"%b %d %H:%M"))
interestingFile = self.fileFilter.isInteresting(
filepath, mtime)
log.debug(
"file %s is interesting ? %s" %
(filepath, interestingFile))
if interestingFile:
self.nbr_walked_files += 1
files_to_insert.append(File(
id_execution=None,
filename=filename[0],
isDirectory=False,
isSymLink=False,
size=filename[1],
mtime=datetime.fromtimestamp(mtime),
))
except StopIteration as msg:
log.debug(
"getFileInfoList iteration stopped '%s'" %
(str(msg)))
pass  # normal end of iteration
if files_to_insert:
self.fileInfoDatabase.addFileList(files_to_insert)
except DC_UrlError as msg:
log.debug(
'https_iteration error : %s. Stopping iteration.' %
(str(msg)))
raise
self.updateValidExecutionStatus(True)
def getLnkSrcDst(self, lnk_str):
m_lnk_str = lnk_regexp.match(lnk_str)
if m_lnk_str is not None:
name_src, name_dst = m_lnk_str.groups()
else:
# not a "src -> dst" link string: keep the name, no destination
name_src, name_dst = lnk_str, None
return name_src, name_dst
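# Illustration (hypothetical name): getLnkSrcDst("latest.dat -> bsh072021.210318.dat")
# returns ("latest.dat", "bsh072021.210318.dat"); a string without " -> "
# comes back as-is with a None destination.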
def extract_files_from_listing_file(self, data, data_server_address=""):
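# Every entry appended to `files` below is a 3-tuple
# ("<name><FILE_URL_SEPARATOR><download url>", "<size>", "<ls date>"), e.g.
# ("bsh992021.210315.dat -|- /bsh992021.210315.dat", "1320", "Mar 15 2021")
# when data_server_address is empty (see the test fixture further down).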
current_dir = ""
file_regexp = re.compile(FILE_FORMAT_REGEXP)
file_regexp2 = re.compile(FILE_FORMAT_REGEXP2)
dir_regexp = re.compile(DIR_FORMAT_REGEXP)
files = []
for line in data.splitlines():
line_recognized = False
if line.startswith('total '):
continue
if not line.strip():
continue
m = file_regexp.match(line)
if m is not None:
line_recognized = True
rights, links, user, group, size, date_time, name = m.groups()
isSymLink = False
if rights[0] == 'l':
isSymLink = True
name, dst = self.getLnkSrcDst(name)
path = os.path.join(current_dir, name)
files.append((
path + FILE_URL_SEPARATOR + data_server_address.rstrip('/') + "/" + path.lstrip('/'),
size,
date_time
))
continue
m = file_regexp2.match(line)
if m is not None:
line_recognized = True
rights, links, user, group, size, date_time, name = m.groups()
isSymLink = False
if rights[0] == 'l':
isSymLink = True
name, dst = self.getLnkSrcDst(name)
path = os.path.join(current_dir, name)
files.append((
path + FILE_URL_SEPARATOR + data_server_address.rstrip('/') + "/" + path.lstrip('/'),
size,
date_time
))
continue
m = dir_regexp.match(line)
if m is not None:
line_recognized = True
current_dir = m.group(1)
log.debug("DIR_FMT : dir=%s" % current_dir)
continue
if not line_recognized:
log.warning("NO MATCH : %s" % line)
continue
return files
def https_get_files_from_listing_file(self, url="", data_server_address="", timeout=120):
log.debug('https_listing_file : url=%s' % url)
try:
data = urllib.request \
.urlopen(url, timeout=timeout) \
.read() \
.decode("utf-8")
except urllib.error.URLError as msg:
error = DC_UrlError("URL/HTTP error : %s" % (msg))
log.debug(error)
raise error
return self.extract_files_from_listing_file(data, data_server_address)
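A minimal usage sketch of the new parser on a saved listing (the host name and local file name are hypothetical, and the setConfig/fileFilter wiring that DCheck performs is omitted):

# Hypothetical standalone use: parse an already-downloaded listing without
# any HTTPS round-trip (https_get_files_from_listing_file fetches it otherwise).
from eo_dataflow_manager.dchecktools.protocols.https_listing_file import Protocol_https_listing_file

protocol = Protocol_https_listing_file("https://data.example.org")
with open("tracks_2021.list") as f:
    listing = f.read()
for name_and_url, size, date in protocol.extract_files_from_listing_file(
        listing, "https://data.example.org"):
    print(name_and_url, size, date)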
total 13
-rw-r--r--. 1 atcfcm atcf 1320 Mar 15 2021 bsh992021.210315.dat
-rw-r--r--. 1 atcfcm atcf 5679 Mar 15 2021 bwp012021.210315.dat
-rw-r--r--. 1 atcfcm atcf 21849 Mar 15 2021 bsh242021.210315.dat
-rw-r--r--. 1 atcfcm atcf 22509 Mar 16 2021 bsh242021.210316.dat
-rw-r--r--. 1 atcfcm atcf 22509 Mar 17 2021 bsh242021.210317.dat
-rw-r--r--. 1 atcfcm atcf 165 Mar 17 2021 bsh942021.210317.dat
-rw-r--r--. 1 atcfcm atcf 7989 Mar 18 18:25 bsh012021.210318.dat
-rw-r--r--. 1 atcfcm atcf 11454 Mar 18 18:25 bsh022021.210318.dat
-rw-r--r--. 1 atcfcm atcf 4524 Mar 18 18:25 bsh032021.210318.dat
-rw-r--r--. 1 atcfcm atcf 2379 Mar 18 18:25 bsh042021.210318.dat
-rw-r--r--. 1 atcfcm atcf 15744 Mar 18 18:25 bsh052021.210318.dat
-rw-r--r--. 1 atcfcm atcf 4524 Mar 18 18:25 bsh062021.210318.dat
-rw-r--r--. 1 atcfcm atcf 7164 Mar 18 18:25 bsh072021.210318.dat
\ No newline at end of file
import unittest
import os
import re
import ast
import shutil
from eo_dataflow_manager.dchecktools.protocols.https_directorylist import FileExtractor
from eo_dataflow_manager.dchecktools.protocols.file_extractor import *
class PluginsFileExtractorTestCase(unittest.TestCase):
set_of_tests = {}
def run_Extractor(self, extractor, set_of_test):
html = ""
with open(set_of_test['filepath'], 'r') as f:
for line in f.readlines():
html += line.rstrip("\n")
extractor.feed(html)
files = extractor.get_files()
self.assertEqual(str(files), set_of_test['files_extracted'])
def setUp(self) -> None:
if PluginsFileExtractorTestCase.set_of_tests == {}:
cfg = ""
cfg_file = os.path.splitext(__file__)[0] + '.cfg'
with open(cfg_file, 'r') as f:
for line in f.readlines():
cfg += line.rstrip("\n")
PluginsFileExtractorTestCase.set_of_tests = ast.literal_eval(cfg)
if self._testMethodName in PluginsFileExtractorTestCase.set_of_tests.keys():
self.set_of_test = PluginsFileExtractorTestCase.set_of_tests[self._testMethodName]
else:
self.skipTest(f"No set of test for {self._testMethodName} in {os.path.splitext(__file__)[0] + '.cfg'}")
# timeout in minutes
self.__timeout = 60
def test_FileExtractor(self) -> None:
for set_of_test in self.set_of_test:
extractor = FileExtractor()
self.run_Extractor(extractor, set_of_test)
def test_UniBremen(self) -> None:
for set_of_test in self.set_of_test:
extractor = FileExtractor_UniBremen()
self.run_Extractor(extractor, set_of_test)
def test_UniHamburg(self) -> None:
for set_of_test in self.set_of_test:
extractor = FileExtractor_UniHamburg()
self.run_Extractor(extractor, set_of_test)
def test_Nomads(self) -> None:
for set_of_test in self.set_of_test:
extractor = FileExtractor_Nomads()
self.run_Extractor(extractor, set_of_test)
{
"test_https_get_files_from_listing_file":
[
{
"filepath": "../../data/fileextractor/tracks_2021.list",
"files_extracted": "[('bsh992021.210315.dat -|- /bsh992021.210315.dat', '1320', 'Mar 15 2021'), ('bwp012021.210315.dat -|- /bwp012021.210315.dat', '5679', 'Mar 15 2021'), ('bsh242021.210315.dat -|- /bsh242021.210315.dat', '21849', 'Mar 15 2021'), ('bsh242021.210316.dat -|- /bsh242021.210316.dat', '22509', 'Mar 16 2021'), ('bsh242021.210317.dat -|- /bsh242021.210317.dat', '22509', 'Mar 17 2021'), ('bsh942021.210317.dat -|- /bsh942021.210317.dat', '165', 'Mar 17 2021'), ('bsh012021.210318.dat -|- /bsh012021.210318.dat', '7989', 'Mar 18 18:25'), ('bsh022021.210318.dat -|- /bsh022021.210318.dat', '11454', 'Mar 18 18:25'), ('bsh032021.210318.dat -|- /bsh032021.210318.dat', '4524', 'Mar 18 18:25'), ('bsh042021.210318.dat -|- /bsh042021.210318.dat', '2379', 'Mar 18 18:25'), ('bsh052021.210318.dat -|- /bsh052021.210318.dat', '15744', 'Mar 18 18:25'), ('bsh062021.210318.dat -|- /bsh062021.210318.dat', '4524', 'Mar 18 18:25'), ('bsh072021.210318.dat -|- /bsh072021.210318.dat', '7164', 'Mar 18 18:25')]",
}
],
}
import unittest
import os
import ast
from eo_dataflow_manager.dchecktools.protocols.https_listing_file import Protocol_https_listing_file
from eo_dataflow_manager.dchecktools.protocols.file_extractor import *
class HttpsListingFileTestCase(unittest.TestCase):
set_of_tests = {}
def run_Extractor(self, extractor, set_of_test):
html = ""
with open(set_of_test['filepath'], 'r') as f:
for line in f.readlines():
html += line.rstrip("\n")
extractor.feed(html)
files = extractor.get_files()
self.assertEqual(str(files), set_of_test['files_extracted'])
def setUp(self) -> None:
if HttpsListingFileTestCase.set_of_tests == {}:
cfg = ""
cfg_file = os.path.splitext(__file__)[0] + '.cfg'
with open(cfg_file, 'r') as f:
for line in f.readlines():
cfg += line.rstrip("\n")
HttpsListingFileTestCase.set_of_tests = ast.literal_eval(cfg)
if self._testMethodName in HttpsListingFileTestCase.set_of_tests.keys():
self.set_of_test = HttpsListingFileTestCase.set_of_tests[self._testMethodName]
else:
self.skipTest(f"No set of test for {self._testMethodName} in {os.path.splitext(__file__)[0] + '.cfg'}")
# timeout in minutes
self.__timeout = 60
self.protocol = Protocol_https_listing_file()
def test_https_get_files_from_listing_file(self) -> None:
for set_of_test in self.set_of_test:
data = ""
with open(set_of_test['filepath'], 'r') as f:
for line in f.readlines():
data += line
files = self.protocol.extract_files_from_listing_file(data)
self.assertEqual(str(files), set_of_test['files_extracted'])
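Both suites rely only on the standard unittest runner; assuming the usual test_*.py naming for these files, they can be discovered and run with:

python -m unittest discover -p "test_*.py" -v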