This tutorial covers how to download water column sonar data stored in an Amazon Web Services (AWS) S3 bucket using the S3 file paths provided by the map viewer data discovery portal. It demonstrates two download methods: the boto3 Python library and the AWS CLI run through the subprocess library.
# Install libraries as needed using pip. Examples below:
# !pip install urllib3
# !pip install requests
# !pip install boto3
# !pip install botocore
# !pip install awscli
# Import libraries
import json
import os
import subprocess
import requests
import boto3
import botocore
from botocore import UNSIGNED
from botocore.client import Config
From the map viewer, you can access a JSON-formatted listing of S3 file paths for a selected dataset.
Read in and parse the JSON data from the resulting URL to extract the file paths needed to download the files; example output is shown below the code.
# Paste in the url to the JSON formatted listing of s3 file paths
json_url = "https://tinyurl.com/ynh9ecbb"

# Fetch the JSON data from this url and convert it to a string
response = requests.get(json_url)
text = response.text

# Convert JSON string to a Python dictionary
contents = json.loads(text)

# Display dictionary keys
for k in contents.keys():
    print(k)

# Access and print file names and S3 paths from the dictionary
for i in contents['features']:
    file_name = i['attributes']['FILE_NAME']
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        print(f"{file_name}, {cloud_path}")
displayFieldName
fieldAliases
fields
features
0352_20171031_101728_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0352_20171031_101728_EX1709_MB.wcd
0353_20171031_111730_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0353_20171031_111730_EX1709_MB.wcd
0354_20171031_121730_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0354_20171031_121730_EX1709_MB.wcd
0355_20171031_131723_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0355_20171031_131723_EX1709_MB.wcd
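If you want to see how each attribute is defined before looping over the features, you can inspect the 'fields' entry. This is a minimal sketch that assumes the ArcGIS-style response structure shown above, with 'name' and 'alias' keys per field; adjust if your JSON listing differs.
# Optional: inspect the attribute fields advertised by the service.
# Assumes each field entry has 'name' and 'alias' keys, as in the output above.
for field in contents.get('fields', []):
    print(field.get('name'), '-', field.get('alias'))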
Data files are stored in an Amazon Web Services S3 bucket and are accessible for immediate download using a variety of tools. This tutorial covers two methods: using the boto3 library and using the subprocess library to run AWS CLI commands.
# Setup access to S3 bucket as an anonymous user
s3 = boto3.resource(
    's3',
    aws_access_key_id='',
    aws_secret_access_key='',
    config=Config(signature_version=UNSIGNED),
)
BUCKET = 'noaa-wcsd-pds'
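Before building the full download list, it can help to confirm that anonymous access works. The sketch below lists a few objects under an example prefix taken from the S3 paths shown earlier; the prefix is only illustrative.
# Optional sanity check: list a few objects to confirm anonymous access works.
# The prefix below is just an example taken from the S3 paths printed above.
example_prefix = 'data/raw/Okeanos_Explorer/EX1709/EM302/'
for obj in s3.Bucket(BUCKET).objects.filter(Prefix=example_prefix).limit(5):
    print(obj.key, obj.size)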
Let's parse the JSON data for files and store them in a list that will drive the downloads. We'll pull file names and object keys from the JSON.
The object key looks like:
'data/raw/Okeanos_Explorer/EX1709/EM302/0352_20171031_101728_EX1709_MB.wcd'
The file name looks like:
'0352_20171031_101728_EX1709_MB.wcd'
Because acoustic data are stored in the bucket as individual files rather than tarballs, we'll also filter the bucket for the files associated with any .tar entries in the JSON. Finally, we'll pull the README files, which contain essential metadata about each dataset.
# Create some variables to store file information needed for downloading
files_to_download = []
readme_paths = set()

# Loop through the JSON to get file names and paths to create a list
# of files to download (list of tuples (file_name, object_key))
for i in contents['features']:
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        file_name = i['attributes']['FILE_NAME']
        object_key = i['attributes']['CLOUD_PATH'].replace("s3://noaa-wcsd-pds/", "")
        # Each dataset in AWS contains a README file with metadata about the
        # dataset. These files aren't listed in the JSON retrieved from the map
        # viewer, so we need to filter the bucket to get the file.
        # First, let's add the README base path to a set.
        readme_basepath = object_key.replace(file_name, '')
        readme_paths.add(readme_basepath)
        # Acoustic data are stored in their original raw format in the S3 bucket,
        # so if a file name ends with .tar, we need to filter the bucket for files
        # without the .tar extension that are associated with the tarball.
        if file_name.endswith(".tar"):
            file_base_name = os.path.splitext(file_name)[0]
            folder = os.path.dirname(cloud_path).split(BUCKET + '/')[-1]
            filter_prefix = folder + '/' + file_base_name
            # Use this prefix that excludes the file extension to filter the bucket
            # and update object_key and file_name with each file found
            for obj in s3.Bucket(BUCKET).objects.filter(Prefix=filter_prefix):
                object_key = obj.key
                file_name = object_key.split('/')[-1]
                files_to_download.append((file_name, object_key))
        else:
            # Files that don't have a .tar extension in the JSON don't require
            # filtering the bucket, so they can just be added to the list.
            files_to_download.append((file_name, object_key))

# Find README files
# Filter bucket for README files and add to the list of files to download
for path in readme_paths:
    filter_prefix = path + 'README'
    for obj in s3.Bucket(BUCKET).objects.filter(Prefix=filter_prefix):
        object_key = obj.key
        file_name = object_key.split('/')[-1]
        files_to_download.append((file_name, object_key))
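Before starting the transfer, it can be worth confirming what ended up in the list. A quick check (a minimal sketch, not part of the original workflow):
# Optional: confirm what will be downloaded before kicking off the transfer
print(f"{len(files_to_download)} files queued for download")
for file_name, object_key in files_to_download[:5]:
    print(file_name, '->', object_key)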
Now let's download the data. Loop through the list of file information we created in the previous step and use boto3 to download the files.
# Download files
for f in files_to_download:
    file_name = f[0]
    object_key = f[1]
    try:
        # Check if file already exists locally before downloading
        if file_name not in os.listdir('.'):
            s3.Bucket(BUCKET).download_file(object_key, file_name)
            print('downloaded:', file_name)
        else:
            print('already exists:', file_name)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print(f"{file_name} not found - {e}")
        else:
            raise e
downloaded: 0352_20171031_101728_EX1709_MB.wcd
downloaded: 0353_20171031_111730_EX1709_MB.wcd
downloaded: 0354_20171031_121730_EX1709_MB.wcd
downloaded: 0355_20171031_131723_EX1709_MB.wcd
downloaded: README_EX1709_EM302.md
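If you prefer to keep the downloads out of the working directory, the same boto3 call accepts a full local path. A minimal sketch, assuming a local folder named 'downloads' (any name works):
# Variation: download into a local subdirectory instead of the working directory.
# The folder name 'downloads' is only an example.
download_dir = 'downloads'
os.makedirs(download_dir, exist_ok=True)
for file_name, object_key in files_to_download:
    local_path = os.path.join(download_dir, file_name)
    if not os.path.exists(local_path):
        s3.Bucket(BUCKET).download_file(object_key, local_path)
        print('downloaded:', local_path)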
AWS CLI is a tool that allows interaction with an S3 bucket from the command line. To get started, first install the tool. Note that there are two versions of AWS CLI with different installation requirements: Version 1 requires Python 3.6+ on your system, while Version 2 ships as a standalone installer and does not require a separate Python installation. Once installed, use the Python subprocess library to run AWS CLI commands.
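Before running the loop below, you can verify that the AWS CLI is installed and on your PATH. A minimal sketch; depending on the CLI version the version string may appear on stdout or stderr, hence the fallback.
# Check that the AWS CLI is available before building download commands
version_check = subprocess.run(['aws', '--version'],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               universal_newlines=True)
print(version_check.stdout or version_check.stderr)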
# Variable to store paths to README files
readme_paths = set()

# Loop through file paths and download files from AWS
for i in contents['features']:
    file_name = i['attributes']['FILE_NAME']
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        # Add the README base path to a set.
        readme_basepath = cloud_path.replace(file_name, '')
        readme_paths.add(readme_basepath)
        # Check if data file already exists locally before downloading
        if file_name in os.listdir('.'):
            print('already exists:', file_name)
            continue
        else:
            # Build the aws cli command needed to download files
            arg_list = ['aws', 's3', 'cp']
            file_base_name = os.path.splitext(file_name)[0]
            if file_name.endswith('.tar'):
                # Find files associated with tarballs by using wildcards
                cloud_dir_path = os.path.dirname(cloud_path)
                arg_list.append(cloud_dir_path + "/")
                arg_list.append(".")  # Download destination path. Update as needed.
                arg_list.extend(['--recursive', '--exclude', '*', '--include'])
                arg_list.append(f'{file_base_name}*')  # * is a wildcard
                arg_list.append('--no-sign-request')  # bypass bucket credentials
            else:
                # File isn't a tarball. Use filename as is.
                arg_list.append(cloud_path)
                arg_list.append(".")
                arg_list.append('--no-sign-request')
            # Download file from terminal using subprocess
            process = subprocess.run(args=arg_list, check=False, timeout=600,
                                     cwd=None, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
            if process.returncode:
                print(f"Error {process.stderr} downloading {file_name} from AWS")
            else:
                print('downloaded:', file_name)

# Find README files using wildcards
for path in readme_paths:
    arg_list = ['aws', 's3', 'cp']
    arg_list.append(path)
    arg_list.append(".")
    arg_list.extend(['--recursive', '--exclude', '*', '--include'])
    arg_list.append('README*')  # * is a wildcard
    arg_list.append('--no-sign-request')  # bypass bucket credentials
    # Download README file
    process = subprocess.run(args=arg_list, check=False, timeout=600,
                             cwd=None, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
    if process.returncode:
        print(f"Error {process.stderr} downloading file from AWS")
    else:
        if process.stdout:
            print('downloaded README')
already exists: 0352_20171031_101728_EX1709_MB.wcd
already exists: 0353_20171031_111730_EX1709_MB.wcd
already exists: 0354_20171031_121730_EX1709_MB.wcd
already exists: 0355_20171031_131723_EX1709_MB.wcd
downloaded README
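The same CLI can also be used to browse the bucket structure directly, which pairs well with the documentation linked below. A minimal sketch listing the top-level raw data folders anonymously:
# Optional: browse the bucket structure with the AWS CLI (anonymous access)
process = subprocess.run(['aws', 's3', 'ls', 's3://noaa-wcsd-pds/data/raw/',
                          '--no-sign-request'],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
print(process.stdout)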
Documentation on the bucket structure, along with tools and tutorials for data processing, is listed below.
AWS Registry Page: https://registry.opendata.aws/ncei-wcsd-archive/
Tutorials: https://cires.gitbook.io/ncei-wcsd-archive/
Contact: wcd.info@noaa.gov