Dan Scholes Posted February 22, 2018 Share Posted February 22, 2018 Below I have included a Python 3.6 sample script for downloading files from the PDS Geosciences Node. The sample includes a configuration for downloading data files from both a PDS Geosciences Node archive and files from an Orbital Data Explorer (ODE) cart request. The script supports multiple levels of sub directories, as well. The script includes variables that should be set by the user for his or her environment. The example PDS data set and ODE cart request both exist, and they are available for test executions of the script. Python 3.6 is required for the script to function. This script is also available for download in the downloads section of the forum. # PDSGeosciencesNode_FileDownload.py # Dan Scholes 2/19/18 # Pypthon 3.6 compatible version # Example of downloading data files using # links from HTTP PDS Geosciences Node Data Archive # or Orbital Data Explorer (ODE) Cart location # Note: One drawback of this script is that it downloads one file at a time, rather than multiple streams. # Additional Note: In the future, changes to the PDS Geosciences Node website and Orbital Data Explorer website may cause this example to no longer function. # Disclaimer: This sample code is provided "as is", without warranty of any kind, express or implied. In no event shall the author be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the sample code or the use or other dealings with the sample code. # Phython download website: https://www.python.org/downloads/ import urllib.request import re import time from pathlib import Path # Variables for user to populate---------- saveFilesToThisDirectory = 'c:/temp/data/' # local destination path to save files #next two lines are for downloading from the PDS Geosciences Node archive url = "http://pds-geosciences.wustl.edu/mro/mro-m-rss-5-sdp-v1/mrors_1xxx/" #enter the directory you would like to download relativeLinkPathBase = "http://pds-geosciences.wustl.edu" #this is the default location for the relative paths on the website (just leave this value) #next two lines are for downloading an ODE cart request #url = "http://ode.rsl.wustl.edu/cartdownload/data/sample/" #enter the directory you would like to download #relativeLinkPathBase = "http://ode.rsl.wustl.edu/" #this is the default location for the relative paths on the ode cart website (just leave this value) recursiveVal = True # True/False whether to download files in subdirectories of the specified location in the url variable verboseMessages = False # True/False whether to display verbose messages during the script processing # End of variables for user to populate---------- relativeLinkPathBase = relativeLinkPathBase.rstrip('/') maxDownloadAttempts = 3 filesToDownloadList = [] def get_pageLinks(inUrl,inRecursive): if verboseMessages: print("Cataloging Directory: ",inUrl) #directory to process myURLReader = urllib.request.urlopen(inUrl.rstrip('/')) myResults = myURLReader.read().decode('utf-8').replace("<a href=","<A HREF=").replace("</a>","</A>") myURLReader.close() data=myResults.split("</A>") tag="<A HREF=\"" endtag="\">" for item in data: if "<A HREF" in item: try: ind = item.index(tag) item=item[ind+len(tag):] end=item.index(endtag) except: pass else: #The link is found itemToDownload = item[:end] if "." in itemToDownload: #the link is to a file if relativeLinkPathBase not in itemToDownload: #is the path relative, so we add the base url itemToDownload = relativeLinkPathBase + itemToDownload filesToDownloadList.append(itemToDownload) else: # it's a directory, so let's go into it if recursive is chosen if inRecursive: if itemToDownload not in inUrl: #we make sure it isn't a link to parent directory if relativeLinkPathBase not in itemToDownload: itemToDownload = relativeLinkPathBase + itemToDownload # the directory is a subdirectory, so we will follow it if verboseMessages: print("subdirectory to process ", itemToDownload) get_pageLinks(itemToDownload,inRecursive) def download_files(): # download the files that were identified # this is refering to the global list of files to download localSuccessfulDownloads = 0 print("==Downloads starting ==============") for link in filesToDownloadList: downloadAttempts = 0 fileDownloaded = False if verboseMessages: print("downloading file: ",link) local_link = link; local_link = saveFilesToThisDirectory + local_link.replace(relativeLinkPathBase,"") local_filename = link.split('/')[-1] #make sure the local directory stucture has been created path = Path(local_link.replace(local_filename,"")) path.mkdir(parents=True, exist_ok=True) while not fileDownloaded and downloadAttempts < maxDownloadAttempts: try: urllib.request.urlretrieve(link,local_link) localSuccessfulDownloads += 1 fileDownloaded = True except urllib.error.URLError as e: downloadAttempts += 1 #we will retry the download the number of times allowed by maxDownloadAttempts variable if verboseMessages: print("downloadError: ",e.reason) if verboseMessages: print("downloadErrorFile: ",link," attempt:",downloadAttempts) if downloadAttempts < maxDownloadAttempts: time.sleep(15) #wait 15 seconds before the next attempt else: print("Could not successfully download: ",link," after ",downloadAttempts," download attempts") print("==Downloads complete ==============") print("SuccessfulDownloads: ",localSuccessfulDownloads," out of ",len(filesToDownloadList)) print('==Process is starting ===================') #get the file links get_pageLinks(url, recursiveVal) print("==Collected ", len(filesToDownloadList), " file links ======") #now download the files download_files() Link to comment Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now