#!/usr/bin/env python """ This script allows me to make a quick partial mirror of a particular FTP directory, except for those files, which are already downloaded (but not physically present, only given in a list). It was intended get Fedora updates for home use at the university """ # Last change: 04.04.2007 # TODO: implement interactive downloading # TODO: implement threaded, controlled downloading # TODO: make clear verbosity levels: critical, error, warning, info, debug import ftplib import string import os import os.path import sys import getopt import socket import re import time import gzip import ConfigParser FTPSOURCES="ftpsources" ACTIVE="active" localListingFile = "" # file containg the local listing localScanDir = "" # directory from which make initial file list ftpStartDir = "" # In which FTP dir should we start localStartDir = "" # same for local dir desiredLogLevel = 0 # how verbose should debug information be resumeDownloads = False # should we resume downloads? ghostResumes = False # shall we download only parts of an non-existing file? DANGEROUS! regExpressions = [] # compiled regular expressions baseDownDir = "" # base directory, downloads go to baseConfigDir = "" # base directory, where config files are located ftpHostName = "" loginName = "anonymous" loginPassword = "nobody@nowhere.uk" # Unfortunately, some callback functions need to store persistent data # or have to obtain additional information. This is achieved by use of # global variables (it's not pretty, I know) ftpListing = {} # all directories along with their files on FTP localListing={} # the same for the local part downloadListing = [] # listing of files to download parserDirectories = [] # which directories the LIST-command parser found parserFiles = {} # the same for files downloadFileD = None # in which files store data we've got from FTP totalDownloadSize = 0L totalDownloadSizeSoFar = 0L # for all files downloadSizeSoFar = 0L # for a single file... downloadSize = 0L # symbolic names for fields from the LIST input RIGHTS =0 NUMLINKS=1 UID =2 GID =3 SIZE =4 MONTH =5 DAY =6 TIME =7 FNAME =8 SLEEP_BETWEEN_RECONNECTS=5 SLEEP_BETWEEN_DIRS=0 FTP_DEBUG_LEVEL=0 MAX_CONNECT_RETRIES=3 def printUsage(): print "Usage:", os.path.basename(sys.argv[0]), " configfile" print """ Creating a new listing: -l|--listing A file containing listing of files we already have. -d|--dir create a listing of files we already have in the given directory. The listing will be saved to the file given with the option --listing. Additional options: -h|--help Prints this help text. -v|--verbose Increases verbosity level by one. Can be given multiple times. -c|--config This is the base directory for config files. All paths in local_listing and exclude_list directive are relative to this directory. -s|--savedir This is the base directory for downloads. All paths in local_dir directive are relative to this directory. -r|--resume Resume downloads instead of overwriting on files which are smaller on our side than on server's side. -u|--user Altenative username for FTP. Default is anonymous -p|--password Altenative password for FTP. --ghost Allows to download trailing parts of files which are not present localy. Use with extreme CAUTION! (the best, if you don't) Format of config file: [name-for-section] ftpsources = ftp.example.com:/path/to/mirror ftp.example2.com:/another/path/to/mirror local_listing = file_with_files_we_already_have.gz local_dir = director_where_downloads_go exclude_list = file_with_regex_of_files_to_exlude (can be ommited) active=0|1 (can be ommited, defaults to 1) """ def debug(message, level): """ possibly prints a debug message """ if desiredLogLevel >= level: print message def parserReset(): """ deletes data parsed from a LIST command output """ global parserDirectories, parserFiles parserDirectories = [] parserFiles = {} def readRegularExpressions(regExFile): """ Read regular expressions for file exclusions and store them for later use. """ debug("Reading regular expressions from "+regExFile, 3) regExpressions = [] try: f=open(regExFile) except IOError: print "Could not open regular expressions file", regExFile sys.exit(1) for line in f: line=string.strip(line, "\n") debug("Compiling line "+line, 3) regExpressions.append(re.compile(line)) return regExpressions def applyExclusions(dir, files): """ Checks, which files are to be excluded and deletes them from the list """ debug("Applying exlusions.", 3) if regExpressions == []: debug("No reg-ex defined.", 3) return files for regEx in regExpressions: if regEx.search(dir): # if dir matches the regEx, no file can survive... if type(files) == dict: return {} elif type(files) == list: return [] else: print "Unexpected list type "+str(type(files))+" (1)" sys.exit(1) if type(files) == dict: for file in files.keys(): filepath=os.path.join(dir,file) debug("Looking, if "+regEx.pattern+" matches "+filepath, 2) if regEx.search(filepath): debug("Regex "+regEx.pattern+" matches "+filepath, 1) del(files[file]) elif type(files) == list: i=0 while i < len(files): filepath=os.path.join(dir,files[i])+os.sep # directories should end with a separator debug("Looking, if "+regEx.pattern+" matches "+filepath, 2) if regEx.search(filepath): debug("Regex "+regEx.pattern+" matches "+filepath, 1) del(files[i]) else: i=i+1 else: print "Unexpected list type "+str(type(files))+" (2)" sys.exit(1) return files def listparser(line): """ parses a line from LIST command. It will determine if an object is a file, or a dir, save filenames and file size. Right now, no more files types are recognized. Symbolic links will be ignored. """ global parserFile, parserDirectories debug(line, 4) parts = string.split(line) ftype = parts[RIGHTS][0] if ftype == '-': try: size=long(parts[SIZE]) except ValueError: size=0L parserFiles[parts[FNAME]] = size elif ftype == 'd': parserDirectories.append(parts[FNAME]) else: debug("Unknown type "+ftype+" for file "+parts[FNAME], 1) # debug("Unknown type "+ftype+" for file "+str(parts), 1) def ftpWalker(currDir): """ Recursively look into the FTP directories and create a FTP file list """ global parserFiles, parserDirectories, ftpHostName, loginName, loginPassword, ftpStartDir debug("FTP walker in "+currDir, 1) parserReset() debug("Parser resettet", 3) retries=0 while (retries > -1 and retries < 3): try: ftp.dir(currDir, listparser) retries=-1 except ftplib.error_temp: print "Connection timeout. I'll try to reconnect." connectFTP(ftpHostName, loginName, loginPassword) changeFTPdir(ftpStartDir) retries=retries+1 except socket.error: print "Connection timeout. I'll try to reconnect." connectFTP(ftpHostName, loginName, loginPassword) changeFTPdir(ftpStartDir) retries=retries+1 if retries != -1: print "Maximal count on retries reached. Exiting" return debug("Got FTP list", 3) # exclusions are important for files AND directories - it saves time # otherwise wasted in searching excluded directories files = applyExclusions(currDir, parserFiles) directories = applyExclusions(currDir, parserDirectories) debug("Got dirs: "+str(directories), 2) debug("Got files: "+str(files), 2) if files != {}: ftpListing[currDir] = files for directory in directories: if SLEEP_BETWEEN_DIRS > 0: time.sleep(SLEEP_BETWEEN_DIRS) # pause x second debug("Now diving to "+currDir+"/"+directory, 2) ftpWalker(currDir+"/"+directory) def localFileStore(chunk): """ Callback-function. Stores file chunks which were downloaded from the FTP into the file. The file descriptor has to be given by a global variable called downloadFileD. """ global downloadFileD, totalDownloadSizeSoFar, totalDownloadSize, downloadSize, downloadSizeSoFar chunkSize = len(chunk) debug("Called localFileStore() for file "+downloadFileD.name+" with chunk size="+str(chunkSize), 5) downloadFileD.write(chunk) downloadSizeSoFar = downloadSizeSoFar + chunkSize totalDownloadSizeSoFar = totalDownloadSizeSoFar + chunkSize # escape sequence brings the cursor to the beginning of the line sys.stdout.write("\033[0G(%6dk /%6dk) [%3d%%] Total: (%5dM /%5dM) [%3d%%]" % (downloadSizeSoFar/1024, downloadSize/1024, 100*downloadSizeSoFar/downloadSize, totalDownloadSizeSoFar/1048576, totalDownloadSize/1048576, 100*totalDownloadSizeSoFar/totalDownloadSize) ) # Do not close! Whoever opened the file will also close it. def downloadFile(dir, file): """ Starts the FTP download for a single file. """ global downloadFileD, localListing, ftpListing, resumeDownloads, ghostResumes, totalDownloadSizeSoFar, totalDownloadSize, downloadSize, downloadSizeSoFar downloadFileName = os.path.join(dir, file) debug("Downloading file "+downloadFileName, 1) # make sure a local directory path exists if not os.path.exists(dir): os.makedirs(dir) # Here we have to look, if we should resume downloads. Resuming downloads only makes # sense, if the local file is smaller than the remote file. If it's the other way # round, overwrite the local file. if resumeDownloads: if ghostResumes: # If the user wants ghost resumes, we cannot append to already existing files! # We will set such files to zero size first. if localListing[dir].has_key(file) and localListing[dir][file] > 0: if os.path.exists(downloadFileName) and os.path.getsize(downloadFileName) > 0: debug("You wanted ghost resumes but "+downloadFileName+" is present. I'll reset it.", 1) f=open(downloadFileName, "w") f.close() else: if os.path.exists(downloadFileName): localListing[dir][file] = os.path.getsize(downloadFileName) else: if os.path.exists(downloadFileName): localListing[dir][file] = os.path.getsize(downloadFileName) else: print "Local file", downloadFileName, "does not exist. Assuming zero size." localListing[dir][file] = 0L downloadSize = ftpListing[dir][file] if localListing[dir].has_key(file): downloadSizeSoFar = localListing[dir][file] else: downloadSizeSoFar = 0L vfsstat = os.statvfs(dir) free = vfsstat.f_frsize * vfsstat.f_bfree if ( free < (downloadSize - downloadSizeSoFar) ): print "File %s does not fit into %s" % (file, dir) return False totalDownloadSizeSoFar = totalDownloadSizeSoFar + downloadSizeSoFar # If files have already identical sizes, skip it. if localListing[dir].has_key(file) and localListing[dir][file] == ftpListing[dir][file]: debug("Files have same sizes, download cancelled.", 3) return True resumeThisDownload = resumeDownloads and (localListing[dir][file] < ftpListing[dir][file]) and (localListing[dir][file] > 0) try: if resumeThisDownload: downloadFileD = open(downloadFileName, "ab") debug("Resuming FTP download for "+downloadFileName, 2) ftp.retrbinary("RETR "+downloadFileName, localFileStore, 8192, localListing[dir][file]) print "" else: downloadFileD = open(downloadFileName, "wb") debug("Starting FTP download for "+downloadFileName, 2) ftp.retrbinary("RETR "+downloadFileName, localFileStore, 8192) print "" except IOError, msg: print "Could not open file", downloadFileName, "for writing:", msg return False except ftplib.all_errors, msg: print "FTP error while downloading", downloadFileName print " reason:", msg return False downloadFileD.close() localListing[dir][file] = os.path.getsize(downloadFileName) if localListing[dir][file] != ftpListing[dir][file]: print "The final size of "+downloadFileName+" ("+str(localListing[dir][file])+") differs from the FTP's file size ("+str(ftpListing[dir][file])+")" return True def searchDir(dir): """ Searches the directory for files and appends them to the download list This function is *NOT* recursive. But most likely it will be called for every directory we've found on the FTP which is almost as good.""" debug("Searching dir "+dir, 2) localListing[dir] = {} for file in ftpListing[dir]: downloadListing.append((dir, file)) def createLocalListing(startDir, outFile): """ Create a listing from a existing directory. """ global localListing debug("Creating local listing of "+startDir+" in file "+outFile, 2) if os.path.isdir(startDir): originCWD = os.getcwd() os.chdir(startDir) else: print "Directory", startDir, "does not exist." return False localListing = {} for root, dirs, files in os.walk("."): localListing[root] = {} for file in files: filepath = os.path.join(root,file) if os.path.isfile(filepath): localListing[root][file] = os.path.getsize(filepath) os.chdir(originCWD) return saveLocalListing(localListing, outFile) def saveLocalListing(localListing, outFile): try: f=gzip.open(outFile, "w", 9) except IOError: print "Could not open file", outFile, "for writing" return False for dir in localListing: for file in localListing[dir]: line = "%s:%d\n" % ( os.path.join(dir,file), localListing[dir][file] ) f.write(line) f.close() return True def loadLocalListing(localListingFile): localListing = {} try: f=gzip.open(localListingFile) except IOError: print "Could not open", localListingFile, "for reading." sys.exit(1) lineCounter=0 for line in f: lineCounter = lineCounter+1 line=line.strip() if len(line) == 0 or line.startswith("#"): # just a comment line or empty continue lineParts = line.split(":") linePartsLen = len(lineParts) if linePartsLen < 2: print "%s, line %d: too few fields." % (localListingFile, lineCounter) continue try: fileSize=long(lineParts[linePartsLen-1]) except ValueError: print "%s, line %d: bad file size %s: not a number." % (localListingFile, lineCounter, lineParts[linePartsLen-1]) continue filepath=string.join(lineParts[0:linePartsLen-1],":") dir, file = os.path.split(filepath) if dir == "": dir = "." if not localListing.has_key(dir): localListing[dir] = {} if len(file) > 0: localListing[dir][file] = fileSize f.close() return localListing def parseOptions(cmdLineArgs): """ Parse command line options. """ global localListingFile, desiredLogLevel, localScanDir, resumeDownloads, ftpHostName, loginName, loginPassword, ghostResumes, showLocalListing, baseConfigDir, baseDownDir if len(cmdLineArgs) == 0: debug("No command line arguments detected.", 3) printUsage() sys.exit(0) try: opts, args = getopt.getopt(cmdLineArgs, "hl:vd:u:p:s:rc:", ["help", "listing=", "verbose", "dir=", "user=", "password=", "savedir=", "resume", "ghost","config"]) except getopt.GetoptError, msg: print "Uknown options on command line:", msg printUsage() sys.exit(1) for o, v in opts: if o in ("-h", "--help"): printUsage() sys.exit(0) elif o in ("-l", "--listing"): localListingFile=v elif o in ("-v", "--verbose"): desiredLogLevel=desiredLogLevel+1 elif o in ("-d", "--dir"): localScanDir=v elif o in ("-u", "--user"): loginName=v elif o in ("-p", "--password"): loginPassword=v elif o in ("-s", "--savedir"): baseDownDir=v elif o in ("-c", "--config"): baseConfigDir=v elif o in ("-r", "--resume"): resumeDownloads=True elif o == "--ghost": ghostResumes = True resumeDownloads = True else: print "Unknown option:",o,v printUsage() sys.exit(1) return args def optionsAreSane(args): """ Check if all options we have parsed so far seem to be sane. In other words: check some sematics. """ global localListingFile, ftpHostName, ftpStartDir, localStartDir, ghostResumes, showLocalListing errorsPresent=False if ghostResumes: print "You want ghost-resumes! This function is presently completely untested." if localScanDir == "" and localListingFile == "": if len(args) != 1: print "We need exactly one argument: the config file with directives how to mirror." errorsPresent=True else: if len(args) > 0: print "Creating a directory listing is a stand-alone operation. You are not allowed\nto specify a config file for downloads at the same time." errorsPresent=True if localScanDir == "" or localListingFile == "": if localScanDir == "": print "You used the --listing option. You have to specify the --dir option as well." else: print "You used the --dir option. You have to specify the --listing option as well." errorsPresent=True if baseConfigDir != "" and not os.path.isdir(baseConfigDir): print baseConfigDir, "is not a directory!" errorsPresent=True return not errorsPresent def showLocalListingContents(localListing): for dir in localListing: for file in localListing[dir]: print "%s : %d bytes" % (os.path.join(dir,file), localListing[dir][file]) def connectFTP(ftpHostName, loginName, loginPassword): tries = 0 while tries < MAX_CONNECT_RETRIES: try: print "Connecting", ftpHostName ftp.connect(ftpHostName) # everything else was ok, so break out of while...# break except socket.gaierror, msg: print "ERROR: Could not connect to", ftpHostName, "->", msg sys.exit(1) except socket.error, msg: print "ERROR: Count not connect to", ftpHostName, "->", msg sys.exit(1) except ftplib.error_temp, msg: if str(msg).startswith("421 "): print "Too many connected users, sleeping", SLEEP_BETWEEN_RECONNECTS, "seconds" time.sleep(SLEEP_BETWEEN_RECONNECTS) tries=tries+1 if not tries < MAX_CONNECT_RETRIES: return False try: ftp.login(loginName, loginPassword) except ftplib.error_perm, msg: print "ERROR: login with username", loginName, "failed. ->", msg sys.exit(1) return True def changeFTPdir(ftpDir): try: ftp.cwd(ftpDir) except ftplib.error_perm: print "ERROR: failed on changing to directory", ftpDir, "on FTP server." print " Most certainly this directory does not exist" return False return True # parse and check command line options args=parseOptions(sys.argv[1:]) if not optionsAreSane(args): print "Stopping because of wrong options." sys.exit(1) # check if user wants to make a new listing if localScanDir != "": if createLocalListing(localScanDir, localListingFile): # a new listing has been created, nothing more to do # we do not allow listing creation and normal dowload operation together sys.exit(0) else: print "Could not create a new listing" sys.exit(1) config=ConfigParser.ConfigParser() config.read(args[0]) sections=config.sections() sections.sort() servers={} # Make a list of sections sorted by servers for section in sections: if not config.has_option(section, ACTIVE) or config.getboolean(section, ACTIVE): if config.has_option(section, FTPSOURCES): for source in config.get(section, FTPSOURCES).split(): act_server, path = source.split(":",1) debug("Adding section "+section+" for server "+act_server, 2) if servers.has_key(act_server): servers[act_server].append(section) else: servers[act_server]=[section] else: print "There is no", FTPSOURCES, "option in section", section # sort servers for priority prio = servers.keys() for i in range(len(prio)): print prio[i] if ( prio[i].endswith("bonn.de") ): tmp = prio.pop(i) prio.insert(0, tmp) for i in range(len(prio)): if ( prio[i].endswith("informatik.uni-bonn.de") ): tmp = prio.pop(i) prio.insert(0, tmp) # And now go through the list by servers and start the download for server in prio: debug("Working with server "+server, 2) ftp = ftplib.FTP() ftp.set_pasv(True) ftp.set_debuglevel(FTP_DEBUG_LEVEL) socket.setdefaulttimeout(30) ftpHostName=server if not connectFTP(ftpHostName, loginName, loginPassword): print "Connection to", ftpHostName, "failed. Proceeding with next server." continue for section in servers[server]: debug("Doing section "+section, 2) # reset state variables for each section ftpListing = {} # all directories along with their files on FTP localListing={} # the same for the local part downloadListing = [] # listing of files to download parserDirectories = [] # which directories the LIST-command parser found parserFiles = {} # the same for files downloadFileD = None # in which files store data we've got from FTP totalDownloadSize = 0L totalDownloadSizeSoFar = 0L # for all files downloadSizeSoFar = 0L # for a single file... downloadSize = 0L localListingFile = os.path.join(baseConfigDir, config.get(section, "local_listing")) print "Loading local listing file", localListingFile localListing = loadLocalListing(localListingFile) for ftpsource in config.get(section, FTPSOURCES).split(): act_server, act_path = ftpsource.split(":",1) if ( act_server == server ): ftpStartDir = act_path localStartDir = os.path.join(baseDownDir, config.get(section, "local_dir")) if config.has_option(section, "exclude_list"): regExpressions = readRegularExpressions(os.path.join(baseConfigDir, config.get(section, "exclude_list"))) if not changeFTPdir(ftpStartDir): print "Changing to directory", ftpStartDir, "failed." continue else: print "Changing to server directory", ftpStartDir, "was successfull." ftpWalker(".") if (ftpListing == {}): break debug("Local listing: "+str(localListing), 4) debug("FTP listing: "+str(ftpListing), 4) # now look at all files and put those which we do not have in our local list # into the download listing for dir in ftpListing: debug("Checking dir: "+dir, 1) if localListing.has_key(dir): # directory already exists on our side for file in ftpListing[dir]: # download, if file does not exist or has different size if not localListing[dir].has_key(file) or ftpListing[dir][file] != localListing[dir][file]: downloadListing.append( (dir, file) ) else: # directory doesn't exist on our side, get it! searchDir(dir) # compute total download size for (dir, file) in downloadListing: totalDownloadSize = totalDownloadSize + ftpListing[dir][file] originCWD=os.getcwd() if downloadListing != []: if not os.path.isdir(localStartDir): try: print "Directory ", localStartDir, " does not exist. Creating it..." os.makedirs(localStartDir) except error: print "Could not create download directory." sys.exit(1) print "Changing to local directory", localStartDir os.chdir(localStartDir) # download everything we have have in the list filesToDownload = len(downloadListing) for i in range(filesToDownload): dir, file = downloadListing[i] print "Downloading %s (%3d /%3d)" % (file, i+1, filesToDownload) if downloadFile(dir, file): debug("Download succeeded", 2) else: debug("Download failed", 1) # now, when everything is downloaded, save the new listing os.chdir(originCWD) if downloadListing != []: # this means we downloaded something print "Saving changes to listing file." if not saveLocalListing(localListing, localListingFile): print "Error saving updated localListing", localListingFile sys.exit(2) print "Closing FTP connection" ftp.quit()