This tutorial covers how to download water column sonar data stored in an Amazon Web Services (AWS) S3 bucket using the S3 file paths provided by the map viewer data discovery portal. It demonstrates two download methods: the boto3 Python library and the AWS CLI run through the subprocess library.
# Install libraries as needed using pip. Examples below:
# !pip install urllib3
# !pip install requests
# !pip install boto3
# !pip install botocore
# !pip install awscli
# Import libraries
import json
import os
import subprocess
import requests
import boto3
import botocore
from botocore import UNSIGNED
from botocore.client import Config
From the map viewer, you can access a JSON-formatted listing of S3 file paths for a selected dataset.
Read in and parse the JSON data from the resulting URL to extract the file paths needed to download the files; example output is shown below the code.
# Paste in the url to the JSON formatted listing of s3 file paths
json_url = "https://tinyurl.com/ynh9ecbb"

# Fetch the JSON data from this url and convert it to a string
response = requests.get(json_url)
text = response.text

# Convert JSON string to a Python dictionary
contents = json.loads(text)

# Display dictionary keys
for k in contents.keys():
    print(k)

# Access and print file names and S3 paths from the dictionary
for i in contents['features']:
    file_name = i['attributes']['FILE_NAME']
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        print(f"{file_name}, {cloud_path}")
displayFieldName
fieldAliases
fields
features
0352_20171031_101728_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0352_20171031_101728_EX1709_MB.wcd
0353_20171031_111730_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0353_20171031_111730_EX1709_MB.wcd
0354_20171031_121730_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0354_20171031_121730_EX1709_MB.wcd
0355_20171031_131723_EX1709_MB.wcd, s3://noaa-wcsd-pds/data/raw/Okeanos_Explorer/EX1709/EM302/0355_20171031_131723_EX1709_MB.wcd
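If you want to see how each attribute is defined before looping over the features, you can inspect the 'fields' entry. This is a minimal sketch that assumes the ArcGIS-style response structure shown above, with 'name' and 'alias' keys per field; adjust if your JSON listing differs.
# Optional: inspect the attribute fields advertised by the service.
# Assumes each field entry has 'name' and 'alias' keys, as in the output above.
for field in contents.get('fields', []):
    print(field.get('name'), '-', field.get('alias'))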
Data files are stored in an Amazon Web Services S3 bucket and are accessible for immediate download using a variety of tools. This tutorial covers two methods: using the boto3 library and using the subprocess library to run AWS CLI commands.
# Setup access to S3 bucket as an anonymous user
s3 = boto3.resource(
    's3',
    aws_access_key_id='',
    aws_secret_access_key='',
    config=Config(signature_version=UNSIGNED),
)
BUCKET = 'noaa-wcsd-pds'
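Before building the full download list, it can help to confirm that anonymous access works. The sketch below lists a few objects under an example prefix taken from the S3 paths shown earlier; the prefix is only illustrative.
# Optional sanity check: list a few objects to confirm anonymous access works.
# The prefix below is just an example taken from the S3 paths printed above.
example_prefix = 'data/raw/Okeanos_Explorer/EX1709/EM302/'
for obj in s3.Bucket(BUCKET).objects.filter(Prefix=example_prefix).limit(5):
    print(obj.key, obj.size)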
Let's parse the JSON data for files and store them in a list that will drive the downloads. We'll pull file names and object keys from the JSON.
The object key looks like:
'data/raw/Okeanos_Explorer/EX1709/EM302/0352_20171031_101728_EX1709_MB.wcd'
The file name looks like:
'0352_20171031_101728_EX1709_MB.wcd'
Because acoustic data are stored in the bucket as individual files rather than tarballs, we'll also filter the bucket for the files associated with any .tar entries in the JSON. Finally, we'll pull the README files, which contain essential metadata about each dataset.
# Create some variables to store file information needed for downloading
files_to_download = []
readme_paths = set()

# Loop through the JSON to get file names and paths to create a list
# of files to download (list of tuples (file_name, object_key))
for i in contents['features']:
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        file_name = i['attributes']['FILE_NAME']
        object_key = i['attributes']['CLOUD_PATH'].replace("s3://noaa-wcsd-pds/", "")
        # Each dataset in AWS contains a README file with metadata about the
        # dataset. These files aren't listed in the JSON retrieved from the map
        # viewer, so we need to filter the bucket to get the file.
        # First, let's add the README base path to a set.
        readme_basepath = object_key.replace(file_name, '')
        readme_paths.add(readme_basepath)
        # Acoustic data are stored in their original raw format in the S3 bucket,
        # so if a file name ends with .tar, we need to filter the bucket for files
        # without the .tar extension that are associated with the tarball.
        if file_name.endswith(".tar"):
            file_base_name = os.path.splitext(file_name)[0]
            folder = os.path.dirname(cloud_path).split(BUCKET + '/')[-1]
            filter_prefix = folder + '/' + file_base_name
            # Use this prefix that excludes the file extension to filter the bucket
            # and update object_key and file_name with each file found
            for obj in s3.Bucket(BUCKET).objects.filter(Prefix=filter_prefix):
                object_key = obj.key
                file_name = object_key.split('/')[-1]
                files_to_download.append((file_name, object_key))
        else:
            # Files that don't have a .tar extension in the JSON don't require
            # filtering the bucket, so they can just be added to the list.
            files_to_download.append((file_name, object_key))

# Find README files
# Filter bucket for README files and add to the list of files to download
for path in readme_paths:
    filter_prefix = path + 'README'
    for obj in s3.Bucket(BUCKET).objects.filter(Prefix=filter_prefix):
        object_key = obj.key
        file_name = object_key.split('/')[-1]
        files_to_download.append((file_name, object_key))
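Before starting the transfer, it can be worth confirming what ended up in the list. A quick check (a minimal sketch, not part of the original workflow):
# Optional: confirm what will be downloaded before kicking off the transfer
print(f"{len(files_to_download)} files queued for download")
for file_name, object_key in files_to_download[:5]:
    print(file_name, '->', object_key)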
Now let's download the data. Loop through the list of file information we created in the previous step and use boto3 to download the files.
# Download files
for f in files_to_download:
    file_name = f[0]
    object_key = f[1]
    try:
        # Check if file already exists locally before downloading
        if file_name not in os.listdir('.'):
            s3.Bucket(BUCKET).download_file(object_key, file_name)
            print('downloaded:', file_name)
        else:
            print('already exists:', file_name)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print(f"{file_name} not found - {e}")
        else:
            raise e
downloaded: 0352_20171031_101728_EX1709_MB.wcd
downloaded: 0353_20171031_111730_EX1709_MB.wcd
downloaded: 0354_20171031_121730_EX1709_MB.wcd
downloaded: 0355_20171031_131723_EX1709_MB.wcd
downloaded: README_EX1709_EM302.md
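If you prefer to keep the downloads out of the working directory, the same boto3 call accepts a full local path. A minimal sketch, assuming a local folder named 'downloads' (any name works):
# Variation: download into a local subdirectory instead of the working directory.
# The folder name 'downloads' is only an example.
download_dir = 'downloads'
os.makedirs(download_dir, exist_ok=True)
for file_name, object_key in files_to_download:
    local_path = os.path.join(download_dir, file_name)
    if not os.path.exists(local_path):
        s3.Bucket(BUCKET).download_file(object_key, local_path)
        print('downloaded:', local_path)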
AWS CLI is a tool that allows interaction with an S3 bucket from the command line. To get started, first install the tool. Note that there are two versions of AWS CLI with different installation requirements: Version 1 requires Python 3.6+ on your system, while Version 2 ships as a standalone installer and does not require a separate Python installation. Once installed, use the Python subprocess library to run AWS CLI commands.
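Before running the loop below, you can verify that the AWS CLI is installed and on your PATH. A minimal sketch; depending on the CLI version the version string may appear on stdout or stderr, hence the fallback.
# Check that the AWS CLI is available before building download commands
version_check = subprocess.run(['aws', '--version'],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               universal_newlines=True)
print(version_check.stdout or version_check.stderr)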
# Variable to store paths to README files
readme_paths = set()

# Loop through file paths and download files from AWS
for i in contents['features']:
    file_name = i['attributes']['FILE_NAME']
    cloud_path = i['attributes']['CLOUD_PATH']
    if cloud_path:
        # Add the README base path to a set.
        readme_basepath = cloud_path.replace(file_name, '')
        readme_paths.add(readme_basepath)
        # Check if data file already exists locally before downloading
        if file_name in os.listdir('.'):
            print('already exists:', file_name)
            continue
        else:
            # Build the aws cli command needed to download files
            arg_list = ['aws', 's3', 'cp']
            file_base_name = os.path.splitext(file_name)[0]
            if file_name.endswith('.tar'):
                # Find files associated with tarballs by using wildcards
                cloud_dir_path = os.path.dirname(cloud_path)
                arg_list.append(cloud_dir_path + "/")
                arg_list.append(".")  # Download destination path. Update as needed.
                arg_list.extend(['--recursive', '--exclude', '*', '--include'])
                arg_list.append(f'{file_base_name}*')  # * is a wildcard
                arg_list.append('--no-sign-request')  # bypass bucket credentials
            else:
                # File isn't a tarball. Use filename as is.
                arg_list.append(cloud_path)
                arg_list.append(".")
                arg_list.append('--no-sign-request')
            # Download file from terminal using subprocess
            process = subprocess.run(args=arg_list, check=False, timeout=600,
                                     cwd=None, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
            if process.returncode:
                print(f"Error {process.stderr} downloading {file_name} from AWS")
            else:
                print('downloaded:', file_name)

# Find README files using wildcards
for path in readme_paths:
    arg_list = ['aws', 's3', 'cp']
    arg_list.append(path)
    arg_list.append(".")
    arg_list.extend(['--recursive', '--exclude', '*', '--include'])
    arg_list.append('README*')  # * is a wildcard
    arg_list.append('--no-sign-request')  # bypass bucket credentials
    # Download README file
    process = subprocess.run(args=arg_list, check=False, timeout=600,
                             cwd=None, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
    if process.returncode:
        print(f"Error {process.stderr} downloading file from AWS")
    else:
        if process.stdout:
            print('downloaded README')
already exists: 0352_20171031_101728_EX1709_MB.wcd
already exists: 0353_20171031_111730_EX1709_MB.wcd
already exists: 0354_20171031_121730_EX1709_MB.wcd
already exists: 0355_20171031_131723_EX1709_MB.wcd
downloaded README
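The same CLI can also be used to browse the bucket structure directly, which pairs well with the documentation linked below. A minimal sketch listing the top-level raw data folders anonymously:
# Optional: browse the bucket structure with the AWS CLI (anonymous access)
process = subprocess.run(['aws', 's3', 'ls', 's3://noaa-wcsd-pds/data/raw/',
                          '--no-sign-request'],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
print(process.stdout)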
Documentation on the bucket structure, along with tools and tutorials for data processing, is listed below.
AWS Registry Page: https://registry.opendata.aws/ncei-wcsd-archive/
Tutorials: https://cires.gitbook.io/ncei-wcsd-archive/
Contact: wcd.info@noaa.gov