Jump to content
PDS Geosciences Node Community
Dan Scholes

Python sample for downloading PDS Geosciences Node archive directories or ODE cart requests

Recommended Posts

Below is a Python 3.6 sample script for downloading files from the PDS Geosciences Node. The sample includes a configuration for downloading data files from a PDS Geosciences Node archive and one for downloading files from an Orbital Data Explorer (ODE) cart request. The script also supports multiple levels of subdirectories. The script includes variables that the user should set for their environment. The example PDS data set and ODE cart request both exist, and they are available for test runs of the script. Python 3.6 is required for the script to function. This script is also available for download in the downloads section of the forum.

# PDSGeosciencesNode_FileDownload.py
# Dan Scholes 2/19/18
# Python 3.6 compatible version
# Example of downloading data files using 
# links from HTTP PDS Geosciences Node Data Archive
# or Orbital Data Explorer (ODE) Cart location
# Note: One drawback of this script is that it downloads one file at a time, rather than multiple streams.
# Additional Note: In the future, changes to the PDS Geosciences Node website and Orbital Data Explorer website may cause this example to no longer function.
# Disclaimer: This sample code is provided "as is", without warranty of any kind, express or implied. In no event shall the author be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the sample code or the use or other dealings with the sample code.

# Python download website: https://www.python.org/downloads/

import urllib.request
import re
import time
from pathlib import Path

# Variables for user to populate----------
# download_files() builds each local path by prefixing this string to the
# link with relativeLinkPathBase removed.
saveFilesToThisDirectory = 'c:/temp/data/'  # local destination path to save files

# Next two lines are for downloading from the PDS Geosciences Node archive.
# `url` is the starting page handed to get_pageLinks(); relativeLinkPathBase
# is prepended to every link that does not already contain it.
url = "http://pds-geosciences.wustl.edu/mro/mro-m-rss-5-sdp-v1/mrors_1xxx/"  #enter the directory you would like to download
relativeLinkPathBase = "http://pds-geosciences.wustl.edu"  #this is the default location for the relative paths on the website (just leave this value)

# Next two lines are for downloading an ODE cart request.
# Uncomment them (and comment out the two assignments above) to use the ODE cart instead.
#url = "http://ode.rsl.wustl.edu/cartdownload/data/sample/" 			#enter the directory you would like to download
#relativeLinkPathBase = "http://ode.rsl.wustl.edu/"							#this is the default location for the relative paths on the ode cart website (just leave this value) 		

recursiveVal = True  # True/False whether to download files in subdirectories of the specified location in the url variable
verboseMessages = False  # True/False whether to display verbose messages during the script processing

# End of variables for user to populate----------

# Drop any trailing "/" so the base can be concatenated directly with link paths.
relativeLinkPathBase = relativeLinkPathBase.rstrip('/')
# Upper bound on download attempts per file in download_files().
maxDownloadAttempts = 3
# Shared work list: get_pageLinks() appends file URLs, download_files() iterates them.
filesToDownloadList = []

def get_pageLinks(inUrl,inRecursive):
    """Catalog the file links found on a directory-listing page.

    Fetches ``inUrl``, extracts the target of every ``<a href="...">`` anchor
    and appends each file link to the module-level ``filesToDownloadList``.
    A link is treated as a file when its href contains a "."; any other link
    is treated as a directory and, when ``inRecursive`` is true, is cataloged
    through a recursive call.

    Hrefs are expected to be root-relative paths (``/a/b/c.txt``), which get
    ``relativeLinkPathBase`` prepended, or URLs that already contain
    ``relativeLinkPathBase``.  Hrefs that cannot be turned into a valid
    archive URL by prepending the base are skipped instead of being mangled:
    empty hrefs, in-page ``#anchors``, query-only ``?...`` links,
    protocol-relative ``//host`` links and links with their own scheme
    (``mailto:``, ``https://elsewhere`` ...).

    Args:
        inUrl: URL of the directory page to read; a trailing "/" is removed
            before the request is made.
        inRecursive: when true, links classified as subdirectories are
            followed; when false, they are ignored.

    Returns:
        None.  Results accumulate in ``filesToDownloadList``.

    Raises:
        urllib.error.URLError: the page (or a followed subdirectory page)
            could not be opened.
        UnicodeDecodeError: the page body is not valid UTF-8.
    """
    if verboseMessages: print("Cataloging Directory: ",inUrl)  #directory to process

    # The context manager closes the connection even if read()/decode() raises.
    with urllib.request.urlopen(inUrl.rstrip('/')) as pageReader:
        pageHtml = pageReader.read().decode('utf-8')

    # Fold the lowercase anchor spelling into the uppercase one so a single
    # pair of markers can be searched for below.
    pageHtml = pageHtml.replace("<a href=","<A HREF=").replace("</a>","</A>")

    tag = "<A HREF=\""
    endtag = "\">"
    # A leading "scheme:" (RFC 3986 syntax) marks an href as a complete URL
    # in its own right, e.g. "mailto:..." or "https://...".
    hasScheme = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*:")

    for item in pageHtml.split("</A>"):
        start = item.find(tag)
        if start == -1:
            continue  # this chunk holds no anchor
        item = item[start + len(tag):]
        end = item.find(endtag)
        if end == -1:
            continue  # anchor without the expected '">' terminator
        itemToDownload = item[:end]

        # Empty hrefs, in-page anchors and query-only links never name an
        # archive file or directory.
        if not itemToDownload or itemToDownload[0] in "#?":
            continue

        needsBase = relativeLinkPathBase not in itemToDownload
        # Prepending the base to a link that carries its own scheme or host
        # would produce an invalid URL, so such links are ignored.
        if needsBase and (itemToDownload.startswith("//") or hasScheme.match(itemToDownload)):
            continue

        if "." in itemToDownload:
            # the link is to a file
            if needsBase:
                # the path is relative, so we add the base url
                itemToDownload = relativeLinkPathBase + itemToDownload
            filesToDownloadList.append(itemToDownload)
        elif inRecursive and itemToDownload not in inUrl:
            # It's a directory.  An href that is part of the current URL is a
            # link back to a parent directory and is not followed.
            if needsBase:
                itemToDownload = relativeLinkPathBase + itemToDownload
            if verboseMessages: print("subdirectory to process ", itemToDownload)
            get_pageLinks(itemToDownload,inRecursive)

def download_files(retryDelaySeconds=15):
    """Download every URL in the module-level ``filesToDownloadList``.

    Each file is saved under ``saveFilesToThisDirectory`` at the link's path
    with ``relativeLinkPathBase`` removed; missing local directories are
    created first.  A failed download is retried until
    ``maxDownloadAttempts`` attempts have been made.  Files are downloaded
    one at a time.  Progress and a final success count are printed.

    Args:
        retryDelaySeconds: seconds to wait between attempts for the same
            file.  Defaults to 15.

    Returns:
        None.
    """
    localSuccessfulDownloads = 0

    print("==Downloads starting ==============")
    for link in filesToDownloadList:
        if verboseMessages: print("downloading file: ",link)
        local_link = saveFilesToThisDirectory + link.replace(relativeLinkPathBase,"")

        # Make sure the local directory structure has been created.  The
        # parent is taken from the path itself: blanking out the file name
        # with str.replace() would also remove it from any directory
        # component that happens to contain the same text.
        Path(local_link).parent.mkdir(parents=True, exist_ok=True)

        for downloadAttempts in range(1, maxDownloadAttempts + 1):
            try:
                urllib.request.urlretrieve(link,local_link)
            except (urllib.error.URLError, ConnectionError, TimeoutError) as e:
                # Connection resets and timeouts are not always wrapped in
                # URLError, so they are retried as well; only URLError has
                # a .reason attribute.
                if verboseMessages: print("downloadError: ",getattr(e,"reason",e))
                if verboseMessages: print("downloadErrorFile: ",link," attempt:",downloadAttempts)
                if downloadAttempts < maxDownloadAttempts:
                    time.sleep(retryDelaySeconds)  #wait before the next attempt
                else:
                    print("Could not successfully download: ",link," after ",downloadAttempts," download attempts")
            else:
                localSuccessfulDownloads += 1
                break

    print("==Downloads complete ==============")
    print("SuccessfulDownloads: ",localSuccessfulDownloads," out of ",len(filesToDownloadList))
	
	
# Script driver: these statements run as soon as the file is executed.
print('==Process is starting ===================')
# Get the file links: catalog `url` (and its subdirectories when recursiveVal
# is true) into filesToDownloadList.
get_pageLinks(url, recursiveVal)
print("==Collected ", len(filesToDownloadList), " file links ======")
# Now download the files that were cataloged above.
download_files()

 

Share this post


Link to post
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.


×
×
  • Create New...