From 6171664a1cde33bc0bcbb9982e5c6390bcf12fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Zi=C3=B3=C5=82ek?= Date: Mon, 23 May 2016 17:11:55 +0200 Subject: [PATCH 1/3] ftp_utils: migration to Paramiko FTP library --- harvestingkit/elsevier_package.py | 7 +-- harvestingkit/ftp_utils.py | 92 +++++++++++++++---------------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/harvestingkit/elsevier_package.py b/harvestingkit/elsevier_package.py index 638f3d5..3b2d549 100644 --- a/harvestingkit/elsevier_package.py +++ b/harvestingkit/elsevier_package.py @@ -38,7 +38,6 @@ from zipfile import ZipFile from xml.dom.minidom import parse - try: from invenio.errorlib import register_exception except ImportError: @@ -423,14 +422,14 @@ def get_abstract(self, xml_doc): def get_keywords(self, xml_doc): head = xml_doc.getElementsByTagName("ja:head") - if not head: + if not head: head = xml_doc.getElementsByTagName("cja:head") if not head: keywords = xml_doc.getElementsByTagName("ce:keyword") else: keywords = head[0].getElementsByTagName("ce:keyword") - return [get_value_in_tag(keyword, "ce:text") - for keyword in keywords + return [get_value_in_tag(keyword, "ce:text") + for keyword in keywords if get_value_in_tag(keyword, "ce:text")] def get_copyright(self, xml_doc): diff --git a/harvestingkit/ftp_utils.py b/harvestingkit/ftp_utils.py index e3a4b9a..575b209 100644 --- a/harvestingkit/ftp_utils.py +++ b/harvestingkit/ftp_utils.py @@ -28,6 +28,7 @@ from urlparse import urlparse from netrc import netrc from datetime import datetime +import paramiko class FtpHandler(object): @@ -44,25 +45,29 @@ class FtpHandler(object): for authentication with the server. 
:type netrc_file: string """ - def __init__(self, server, username='', passwd='', netrc_file=''): + def __init__(self, server, username='', passwd='', netrc_file='', sftp=False): + if sftp: + port = 22 + else: + port = 21 server = urlparse(server) if server.netloc: server = server.netloc elif server.path: server = server.path - self._ftp = FTP(server) + self._ftp = paramiko.Transport((server, port)) self._username = username self._passwd = passwd if netrc_file: logininfo = netrc(netrc_file).authenticators(server) self._username, _, self._passwd = logininfo self.connect() - self._home = self._ftp.pwd() + self._home = self._sftp_client.getcwd() def connect(self): """ Connects and logins to the server. """ - self._ftp.connect() - self._ftp.login(user=self._username, passwd=self._passwd) + self._ftp.connect(username=self._username, password=self._passwd) + self._sftp_client = paramiko.SFTPClient.from_transport(self._ftp) def close(self): """ Closes the connection to the server. """ @@ -99,13 +104,14 @@ def download(self, source_file, target_folder=''): working directory. 
:type target_folder: string """ - current_folder = self._ftp.pwd() + current_folder = self._sftp_client.getcwd() if not target_folder.startswith('/'): # relative path target_folder = join(getcwd(), target_folder) folder = os.path.dirname(source_file) - self.cd(folder) + if folder: + self._sftp_client.chdir(folder) if folder.startswith("/"): folder = folder[1:] @@ -118,14 +124,15 @@ def download(self, source_file, target_folder=''): source_file = os.path.basename(source_file) destination = join(destination_folder, source_file) try: - with open(destination, 'wb') as result: - self._ftp.retrbinary('RETR %s' % (source_file,), - result.write) + # with open(destination, 'wb') as result: + # self._ftp.retrbinary('RETR %s' % (source_file,), + # result.write) + self._sftp_client.get(source_file, destination) except error_perm as e: # source_file is a folder print(e) remove(join(target_folder, source_file)) raise - self._ftp.cwd(current_folder) + self._sftp_client.chdir(current_folder) def cd(self, folder): """ Changes the working directory on the server. @@ -134,13 +141,13 @@ def cd(self, folder): :type folder: string """ if folder.startswith('/'): - self._ftp.cwd(folder) + self._sftp_client.chdir(folder) else: for subfolder in folder.split('/'): if subfolder: - self._ftp.cwd(subfolder) + self._sftp_client.chdir(subfolder) - def ls(self, folder=''): + def ls(self, folder='.'): """ Lists the files and folders of a specific directory default is the current working directory. @@ -150,15 +157,16 @@ def ls(self, folder=''): :returns: a tuple with the list of files in the folder and the list of subfolders in the folder. 
""" - current_folder = self._ftp.pwd() - self.cd(folder) + current_folder = self._sftp_client.getcwd() + self._sftp_client.chdir(folder) + files = [] + folders = [] contents = [] - self._ftp.retrlines('LIST', lambda a: contents.append(a)) - files = filter(lambda a: a.split()[0].startswith('-'), contents) - folders = filter(lambda a: a.split()[0].startswith('d'), contents) - files = map(lambda a: ' '.join(a.split()[8:]), files) - folders = map(lambda a: ' '.join(a.split()[8:]), folders) - self._ftp.cwd(current_folder) + contents = self._sftp_client.listdir() + + files = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('-'), contents) + folders = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('d'), contents) + self._sftp_client.chdir(current_folder) return files, folders def dir(self, folder='', prefix=''): @@ -192,7 +200,7 @@ def mkdir(self, folder): :param folder: the folder to be created. :type folder: string """ - current_folder = self._ftp.pwd() + current_folder = self._sftp_client.getcwd() #creates the necessary folders on #the server if they don't exist folders = folder.split('/') @@ -200,7 +208,7 @@ def mkdir(self, folder): try: self.cd(fld) except error_perm: # folder does not exist - self._ftp.mkd(fld) + self._sftp_client.mkdir(fld) self.cd(fld) self.cd(current_folder) @@ -211,11 +219,11 @@ def rm(self, filename): :type filename: string """ try: - self._ftp.delete(filename) + self._sftp_client.remove(filename) except error_perm: # target is either a directory # either it does not exist try: - current_folder = self._ftp.pwd() + current_folder = self._sftp_client.getcwd() self.cd(filename) except error_perm: print('550 Delete operation failed %s ' @@ -232,7 +240,7 @@ def rmdir(self, foldername): :param foldername: the folder to be deleted. 
:type foldername: string """ - current_folder = self._ftp.pwd() + current_folder = self._sftp_client.getcwd() try: self.cd(foldername) except error_perm: @@ -241,16 +249,16 @@ def rmdir(self, foldername): else: self.cd(current_folder) try: - self._ftp.rmd(foldername) + self._sftp_client.rmdir(foldername) except error_perm: # folder not empty self.cd(foldername) contents = self.ls() #delete the files - map(self._ftp.delete, contents[0]) + map(self._sftp_client.remove, contents[0]) #delete the subfolders map(self.rmdir, contents[1]) self.cd(current_folder) - self._ftp.rmd(foldername) + self._sftp_client.rmdir(foldername) def get_filesize(self, filename): """ Returns the filesize of a file @@ -260,19 +268,14 @@ def get_filesize(self, filename): :returns: string representation of the filesize. """ - result = [] - - def dir_callback(val): - result.append(val.split()[4]) - - self._ftp.dir(filename, dir_callback) - return result[0] + return self._sftp_client.lstat(filename).st_size def get_datestamp(self, filename): - datestamp = self._ftp.sendcmd('MDTM ' + filename) - datestamp = datetime.strptime(datestamp[4:], - "%Y%m%d%H%M%S").strftime("%Y-%M-%d") - return datestamp + # datestamp = self._ftp.sendcmd('MDTM ' + filename) + # datestamp = datetime.strptime(datestamp[4:], + # "%Y%m%d%H%M%S").strftime("%Y-%M-%d") + datestamp = datetime.fromtimestamp(self._sftp_client.lstat(filename).st_mtime) + return datestamp.strftime("%Y-%m-%d") def check_pkgs_integrity(self, filelist, logger, timeout=120, sleep_time=10): @@ -324,11 +327,8 @@ def upload(self, filename, location=''): be stored. 
:type location: string """ - current_folder = self._ftp.pwd() + current_folder = self._sftp_client.getcwd() self.mkdir(location) self.cd(location) - fl = open(filename, 'rb') - filename = filename.split('/')[-1] - self._ftp.storbinary('STOR %s' % filename, fl) - fl.close() + self._sftp_client.put(filename, location) self.cd(current_folder) From c44fd8b627922566c6d5c418e0612d7e6427d75c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Zi=C3=B3=C5=82ek?= Date: Mon, 30 May 2016 18:07:26 +0200 Subject: [PATCH 2/3] ftp_utils: Adds back support of standard Python FTP connection --- harvestingkit/ftp_utils.py | 162 +++++++++++++++++++++++++++---------- 1 file changed, 121 insertions(+), 41 deletions(-) diff --git a/harvestingkit/ftp_utils.py b/harvestingkit/ftp_utils.py index 575b209..123516a 100644 --- a/harvestingkit/ftp_utils.py +++ b/harvestingkit/ftp_utils.py @@ -45,29 +45,37 @@ class FtpHandler(object): for authentication with the server. :type netrc_file: string """ - def __init__(self, server, username='', passwd='', netrc_file='', sftp=False): - if sftp: - port = 22 - else: - port = 21 + def __init__(self, server, username='', passwd='', netrc_file='', port=21, sftp=False): + self.port = port + self.sftp = sftp server = urlparse(server) if server.netloc: server = server.netloc elif server.path: server = server.path - self._ftp = paramiko.Transport((server, port)) + if self.sftp: + self._ftp = paramiko.Transport((server, self.port)) + else: + self._ftp = FTP(server) self._username = username self._passwd = passwd if netrc_file: logininfo = netrc(netrc_file).authenticators(server) self._username, _, self._passwd = logininfo self.connect() - self._home = self._sftp_client.getcwd() + if self.sftp: + self._home = self._sftp_client.getcwd() + else: + self._home = self._ftp.pwd() def connect(self): """ Connects and logins to the server. 
""" - self._ftp.connect(username=self._username, password=self._passwd) - self._sftp_client = paramiko.SFTPClient.from_transport(self._ftp) + if self.sftp: + self._ftp.connect(username=self._username, password=self._passwd) + self._sftp_client = paramiko.SFTPClient.from_transport(self._ftp) + else: + self._ftp.connect(port=self.port) + self._ftp.login(user=self._username, passwd=self._passwd) def close(self): """ Closes the connection to the server. """ @@ -104,14 +112,20 @@ def download(self, source_file, target_folder=''): working directory. :type target_folder: string """ - current_folder = self._sftp_client.getcwd() + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() if not target_folder.startswith('/'): # relative path target_folder = join(getcwd(), target_folder) folder = os.path.dirname(source_file) if folder: - self._sftp_client.chdir(folder) + if self.sftp: + self._sftp_client.chdir(folder) + else: + self.cd(folder) if folder.startswith("/"): folder = folder[1:] @@ -124,15 +138,20 @@ def download(self, source_file, target_folder=''): source_file = os.path.basename(source_file) destination = join(destination_folder, source_file) try: - # with open(destination, 'wb') as result: - # self._ftp.retrbinary('RETR %s' % (source_file,), - # result.write) - self._sftp_client.get(source_file, destination) + if self.sftp: + self._sftp_client.get(source_file, destination) + else: + with open(destination, 'wb') as result: + self._ftp.retrbinary('RETR %s' % (source_file,), + result.write) except error_perm as e: # source_file is a folder print(e) remove(join(target_folder, source_file)) raise - self._sftp_client.chdir(current_folder) + if self.sftp: + self._sftp_client.chdir(current_folder) + else: + self._ftp.cwd(current_folder) def cd(self, folder): """ Changes the working directory on the server. 
@@ -141,13 +160,19 @@ def cd(self, folder): :type folder: string """ if folder.startswith('/'): - self._sftp_client.chdir(folder) + if self.sftp: + self._sftp_client.chdir(folder) + else: + self._ftp.cwd(folder) else: for subfolder in folder.split('/'): if subfolder: - self._sftp_client.chdir(subfolder) + if self.sftp: + self._sftp_client.chdir(subfolder) + else: + self._ftp.cwd(subfolder) - def ls(self, folder='.'): + def ls(self, folder=''): """ Lists the files and folders of a specific directory default is the current working directory. @@ -157,16 +182,27 @@ def ls(self, folder='.'): :returns: a tuple with the list of files in the folder and the list of subfolders in the folder. """ - current_folder = self._sftp_client.getcwd() - self._sftp_client.chdir(folder) + if self.sftp and folder == '': + folder = '.' + files = [] folders = [] contents = [] - contents = self._sftp_client.listdir() - files = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('-'), contents) - folders = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('d'), contents) - self._sftp_client.chdir(current_folder) + if self.sftp: + current_folder = self._sftp_client.getcwd() + self._sftp_client.chdir(folder) + contents = self._sftp_client.listdir() + files = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('-'), contents) + folders = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('d'), contents) + self._sftp_client.chdir(current_folder) + else: + current_folder = self._ftp.pwd() + self.cd(folder) + self._ftp.retrlines('LIST', lambda a: contents.append(a)) + files = filter(lambda a: a.split()[0].startswith('-'), contents) + folders = filter(lambda a: a.split()[0].startswith('d'), contents) + self._ftp.cwd(current_folder) return files, folders def dir(self, folder='', prefix=''): @@ -200,7 +236,11 @@ def mkdir(self, folder): :param folder: the folder to be created. 
:type folder: string """ - current_folder = self._sftp_client.getcwd() + + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() #creates the necessary folders on #the server if they don't exist folders = folder.split('/') @@ -208,7 +248,10 @@ def mkdir(self, folder): try: self.cd(fld) except error_perm: # folder does not exist - self._sftp_client.mkdir(fld) + if self.sftp: + self._sftp_client.mkdir(fld) + else: + self._ftp.mkd(fld) self.cd(fld) self.cd(current_folder) @@ -219,11 +262,17 @@ def rm(self, filename): :type filename: string """ try: - self._sftp_client.remove(filename) + if self.sftp: + self._sftp_client.remove(filename) + else: + self._ftp.delete(filename) except error_perm: # target is either a directory # either it does not exist try: - current_folder = self._sftp_client.getcwd() + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() self.cd(filename) except error_perm: print('550 Delete operation failed %s ' @@ -240,7 +289,11 @@ def rmdir(self, foldername): :param foldername: the folder to be deleted. 
:type foldername: string """ - current_folder = self._sftp_client.getcwd() + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() + try: self.cd(foldername) except error_perm: @@ -249,16 +302,25 @@ def rmdir(self, foldername): else: self.cd(current_folder) try: - self._sftp_client.rmdir(foldername) + if self.sftp: + self._sftp_client.rmdir(foldername) + else: + self._ftp.rmd(foldername) except error_perm: # folder not empty self.cd(foldername) contents = self.ls() #delete the files - map(self._sftp_client.remove, contents[0]) + if self.sftp: + map(self._sftp_client.remove, contents[0]) + else: + map(self._ftp.delete, contents[0]) #delete the subfolders map(self.rmdir, contents[1]) self.cd(current_folder) - self._sftp_client.rmdir(foldername) + if self.sftp: + self._sftp_client.rmdir(foldername) + else: + self._ftp.rmd(foldername) def get_filesize(self, filename): """ Returns the filesize of a file @@ -268,14 +330,23 @@ def get_filesize(self, filename): :returns: string representation of the filesize. 
""" - return self._sftp_client.lstat(filename).st_size + if self.sftp: + return self._sftp_client.lstat(filename).st_size + else: + result = [] + def dir_callback(val): + result.append(val.split()[4]) + self._ftp.dir(filename, dir_callback) + return result[0] def get_datestamp(self, filename): - # datestamp = self._ftp.sendcmd('MDTM ' + filename) - # datestamp = datetime.strptime(datestamp[4:], - # "%Y%m%d%H%M%S").strftime("%Y-%M-%d") - datestamp = datetime.fromtimestamp(self._sftp_client.lstat(filename).st_mtime) - return datestamp.strftime("%Y-%m-%d") + if self.sftp: + datestamp = datetime.fromtimestamp(self._sftp_client.lstat(filename).st_mtime) + return datestamp.strftime("%Y-%m-%d") + else: + datestamp = self._ftp.sendcmd('MDTM ' + filename) + return datetime.strptime(datestamp[4:], + "%Y%m%d%H%M%S").strftime("%Y-%M-%d") def check_pkgs_integrity(self, filelist, logger, timeout=120, sleep_time=10): @@ -327,8 +398,17 @@ def upload(self, filename, location=''): be stored. :type location: string """ - current_folder = self._sftp_client.getcwd() + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() self.mkdir(location) self.cd(location) - self._sftp_client.put(filename, location) + if self.sftp: + self._sftp_client.put(filename, location) + else: + fl = open(filename, 'rb') + filename = filename.split('/')[-1] + self._ftp.storbinary('STOR %s' % filename, fl) + fl.close() self.cd(current_folder) From 2d60aa86c28d137d2912c53fb504d7c7068d4b6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Zi=C3=B3=C5=82ek?= Date: Mon, 6 Jun 2016 16:37:16 +0200 Subject: [PATCH 3/3] ftp_utils: New configuration variables and FTP/SFTP ls method update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds new configuration variables for Elsevier SFTP connection - port, sftp * Adds additional cleaning of the file names in the lists of files/folders returned by the ls method in ftp_utils.py Signed-off-by: 
Wojciech Ziółek --- harvestingkit/contrast_out.py | 6 ++++-- harvestingkit/elsevier_package.py | 2 +- harvestingkit/ftp_utils.py | 6 +++++- user_config.cfg | 2 ++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/harvestingkit/contrast_out.py b/harvestingkit/contrast_out.py index ec47dd4..3752244 100644 --- a/harvestingkit/contrast_out.py +++ b/harvestingkit/contrast_out.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ## ## This file is part of Harvesting Kit. -## Copyright (C) 2013, 2014 CERN. +## Copyright (C) 2013, 2014, 2016 CERN. ## ## Harvesting Kit is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as @@ -99,7 +99,9 @@ def connect(self): try: self.ftp = FtpHandler(self.config.ELSEVIER.URL, self.config.ELSEVIER.LOGIN, - self.config.ELSEVIER.PASSWORD) + self.config.ELSEVIER.PASSWORD, + port = int(self.config.ELSEVIER.PORT), + sftp = self.config.ELSEVIER.SFTP) self.logger.debug(('Successful connection to the ' 'Elsevier server')) return diff --git a/harvestingkit/elsevier_package.py b/harvestingkit/elsevier_package.py index 3b2d549..5a85e54 100644 --- a/harvestingkit/elsevier_package.py +++ b/harvestingkit/elsevier_package.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Harvesting Kit. -# Copyright (C) 2013, 2014, 2015 CERN. +# Copyright (C) 2013, 2014, 2015, 2016 CERN. # # Harvesting Kit is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as diff --git a/harvestingkit/ftp_utils.py b/harvestingkit/ftp_utils.py index 123516a..84e902a 100644 --- a/harvestingkit/ftp_utils.py +++ b/harvestingkit/ftp_utils.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ## ## This file is part of Harvesting Kit. -## Copyright (C) 2014 CERN. +## Copyright (C) 2014, 2016 CERN. 
## ## Harvesting Kit is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as @@ -195,6 +195,8 @@ def ls(self, folder=''): contents = self._sftp_client.listdir() files = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('-'), contents) folders = filter(lambda a: str(self._sftp_client.lstat(a)).split()[0].startswith('d'), contents) + files = map(lambda a: ' '.join(a.split()[8:]), files) + folders = map(lambda a: ' '.join(a.split()[8:]), folders) self._sftp_client.chdir(current_folder) else: current_folder = self._ftp.pwd() @@ -202,6 +204,8 @@ def ls(self, folder=''): self._ftp.retrlines('LIST', lambda a: contents.append(a)) files = filter(lambda a: a.split()[0].startswith('-'), contents) folders = filter(lambda a: a.split()[0].startswith('d'), contents) + files = map(lambda a: ' '.join(a.split()[8:]), files) + folders = map(lambda a: ' '.join(a.split()[8:]), folders) self._ftp.cwd(current_folder) return files, folders diff --git a/user_config.cfg b/user_config.cfg index 840f37c..4c4b2b7 100644 --- a/user_config.cfg +++ b/user_config.cfg @@ -2,6 +2,8 @@ login = empty password = empty url = empty +port = 21 +sftp = True [OXFORD] login = empty