Amazon Glacier Backups in Bash

Why use an official Java or .NET-based algorithm when you can write your own in Bash? This script uses the AWS CLI tool to back up a file to an Amazon Glacier vault, without the need for a heavy runtime.

Tested on Linux only; the script relies on GNU coreutils long options (e.g. `split --bytes`, `mktemp --directory`), so YMMV on other systems.

Requires the AWS CLI tool to be configured appropriately via `aws configure`.

#!/bin/bash
#
# This script takes a path to a file and uploads it to Amazon
# Glacier. It does this in several steps:
#
#    1. Split the file up into 1MiB chunks.
#    2. Initiate a multipart upload.
#    3. Upload each part individually.
#    4. Calculate the file's tree hash and finish the upload.
#
# See: http://amzn.to/1RjTwYk
#
# Author: Damien Radtke <damienradtke at gmail dot com>
# License: WTFPL

# Set this to the name of the Glacier vault to upload to.
VAULT_NAME=...
# 1 MiB in bytes; the tree hash algorithm requires chunks of this
# size.
CHUNK_SIZE=1048576

# The archive to upload is the first positional argument.
if [[ -z "${1}" ]]; then
    echo "No file provided."
    exit 1
fi

# Resolve to an absolute path now, because the script changes into a
# temporary directory below. The argument is quoted so that paths
# containing spaces or glob characters are passed through intact.
ARCHIVE="$(realpath "${1}")"
if [[ ! -f "${ARCHIVE}" || ! -r "${ARCHIVE}" ]]; then
    echo "Not a readable file: ${1}"
    exit 1
fi

# Total size in bytes; used for the byte ranges of the parts and for
# the final --archive-size.
ARCHIVE_SIZE=$(wc --bytes < "${ARCHIVE}")
if [[ ! "${ARCHIVE_SIZE}" -gt 0 ]]; then
    # With no data there is nothing to split or upload.
    echo "File is empty: ${ARCHIVE}"
    exit 1
fi

# Work inside a private scratch directory so the chunk and hash files
# never touch the caller's directory. Bail out if it cannot be created
# or entered; otherwise chunks would be written to the current
# directory.
if ! TEMP=$(mktemp --directory); then
    echo "Could not create a temporary directory."
    exit 1
fi
if ! cd "${TEMP}"; then
    echo "Could not enter temporary directory: ${TEMP}"
    rm -rf "${TEMP}"
    exit 1
fi

# Exit hook: announce the cleanup, step back to the directory the
# shell was in before its most recent cd, and delete the scratch
# directory named by TEMP together with everything in it.
cleanup() {
    printf '%s\n' "Cleaning up."
    # ~- expands to the previous working directory.
    cd ~-
    rm -r -f "${TEMP}"
}
trap 'cleanup' EXIT

echo "Initiating multipart upload..."

# Split the archive into chunks named chunk<suffix> in the current
# (temporary) directory.
if ! split --bytes="${CHUNK_SIZE}" "${ARCHIVE}" chunk; then
    echo "Failed to split ${ARCHIVE} into chunks."
    exit 1
fi

# Count the chunks with a glob instead of parsing ls output. An
# unmatched glob stays literal, so verify the first entry exists.
CHUNKS=(chunk*)
if [[ ! -e "${CHUNKS[0]}" ]]; then
    echo "No chunks were produced from ${ARCHIVE}."
    exit 1
fi
NUM_CHUNKS=${#CHUNKS[@]}

# Initiate upload.
#
# The output is captured without piping it through another command:
# $? after a pipeline reports the status of the LAST stage, which
# would hide a failure of the aws call itself.
UPLOAD_ID=$(aws glacier initiate-multipart-upload \
    --account-id=- \
    --vault-name="${VAULT_NAME}" \
    --archive-description="$(basename "${ARCHIVE}")" \
    --part-size="${CHUNK_SIZE}" \
    --query=uploadId)

RETVAL=$?
if [[ ${RETVAL} -ne 0 ]]; then
    echo "initiate-multipart-upload failed with status code: ${RETVAL}"
    exit 1
fi

# The queried value may be printed as a quoted JSON string; strip the
# double quotes to get the bare ID.
UPLOAD_ID=${UPLOAD_ID//\"/}
if [[ -z "${UPLOAD_ID}" ]]; then
    echo "initiate-multipart-upload returned no upload ID."
    exit 1
fi
echo "Upload ID: ${UPLOAD_ID}"

# Abort the upload if forced to exit.
#
# Signal handler for SIGINT and SIGTERM. Asks Glacier to discard the
# in-progress multipart upload identified by UPLOAD_ID in VAULT_NAME,
# then terminates the script with status 1.
function abort_upload {
    echo "Aborting upload."
    aws glacier abort-multipart-upload \
        --account-id=- \
        --vault-name="${VAULT_NAME}" \
        --upload-id="${UPLOAD_ID}"
    # Without an explicit exit the shell would resume whatever it was
    # doing when the signal arrived and keep operating on an upload
    # that no longer exists.
    exit 1
}
trap abort_upload SIGINT SIGTERM

# Loop through the chunks.
#
# Each chunk is uploaded as one part of the multipart upload, and its
# binary SHA-256 digest is stored in a matching hash<suffix> file for
# the tree hash calculation that follows.

# Maximum number of upload attempts per chunk before giving up. Can be
# overridden from the environment. Without a bound, a permanent error
# reported as status 255 would be retried forever.
MAX_ATTEMPTS=${MAX_ATTEMPTS:-10}

INDEX=0
for CHUNK in chunk*; do
    # Calculate the byte range for this chunk. The last chunk may be
    # short, so clamp its end to the final byte of the archive.
    START=$((INDEX*CHUNK_SIZE))
    END=$((((INDEX+1)*CHUNK_SIZE)-1))
    END=$((END>(ARCHIVE_SIZE-1)?ARCHIVE_SIZE-1:END))
    # Increment the index; from here on it is the 1-based chunk number.
    INDEX=$((INDEX+1))

    ATTEMPT=1
    while true; do
        echo "Uploading chunk ${INDEX} / ${NUM_CHUNKS}..."
        aws glacier upload-multipart-part \
            --account-id=- \
            --vault-name="${VAULT_NAME}" \
            --upload-id="${UPLOAD_ID}" \
            --body="${CHUNK}" \
            --range="bytes ${START}-${END}/*" \
            >/dev/null
        RETVAL=$?
        if [[ ${RETVAL} -eq 0 ]]; then
            # Upload succeeded, on to the next one.
            break
        elif [[ ${RETVAL} -eq 130 ]]; then
            # Received a SIGINT; aborting the upload is left to the
            # signal trap.
            exit 1
        elif [[ ${RETVAL} -eq 255 && ${ATTEMPT} -lt ${MAX_ATTEMPTS} ]]; then
            # Most likely a timeout, just let it try again.
            echo "Chunk ${INDEX} ran into an error, retrying..."
            ATTEMPT=$((ATTEMPT+1))
            sleep 1
        else
            # Unexpected status, or the retry budget is exhausted.
            echo "upload-multipart-part failed with status code: ${RETVAL}"
            abort_upload
            exit 1
        fi
    done

    # Store the chunk's binary SHA-256; ${CHUNK:5} drops the "chunk"
    # prefix so the hash file carries the same suffix as its chunk.
    if ! openssl dgst -sha256 -binary "${CHUNK}" > "hash${CHUNK:5}"; then
        echo "Failed to hash ${CHUNK}."
        abort_upload
        exit 1
    fi
done

# Calculate tree hash.
# ("And now for the tricky bit.")
#
# compute_tree_hash reduces the hash* files in the current directory
# to a single digest and prints it to stdout as lowercase hex.
#
# Each hash* file holds one binary SHA-256 digest. On every pass,
# neighbouring files are paired up in glob order; each pair is replaced
# by the SHA-256 of their concatenation, stored under the first file's
# name so the ordering is preserved. A trailing unpaired file is
# carried into the next pass unchanged. When only one file remains, its
# content IS the tree hash; in particular, a single-chunk archive's
# tree hash is the chunk's own digest and is not hashed a second time.
#
# Outputs: the tree hash in hex on stdout; errors on stderr.
# Returns: 0 on success, 1 if there are no hash files or hashing fails.
# Side effects: consumes the hash* files (only one remains afterwards).
function compute_tree_hash {
    local -a hashes
    local i
    while true; do
        hashes=(hash*)
        # An unmatched glob is left as the literal pattern, so check
        # that the first entry really exists.
        if [[ ! -e "${hashes[0]}" ]]; then
            echo "No chunk hashes found." >&2
            return 1
        fi
        if [[ ${#hashes[@]} -eq 1 ]]; then
            # Hex-encode the final 32-byte digest.
            od -An -v -tx1 "${hashes[0]}" | tr -d ' \n'
            return 0
        fi
        for ((i = 0; i + 1 < ${#hashes[@]}; i += 2)); do
            cat "${hashes[i]}" "${hashes[i+1]}" \
                | openssl dgst -sha256 -binary > temphash || return 1
            rm "${hashes[i]}" "${hashes[i+1]}" || return 1
            mv temphash "${hashes[i]}" || return 1
        done
    done
}

echo "Calculating tree hash..."
if ! TREE_HASH=$(compute_tree_hash); then
    echo "Failed to calculate tree hash."
    # Leave the checksum empty rather than passing on a bogus value.
    # NOTE(review): the completion step that follows is expected to
    # reject an empty --checksum and take its abort path - confirm.
    TREE_HASH=""
fi

# Final step: ask Glacier to assemble the uploaded parts into an
# archive, passing the tree hash and total size. If the request
# fails, the multipart upload is aborted and the script exits with
# status 1; otherwise it reports success and exits with status 0.
echo "Finalizing..."

# Options shared by the completion and abort requests.
UPLOAD_TARGET=(
    --account-id=-
    --vault-name="${VAULT_NAME}"
    --upload-id="${UPLOAD_ID}"
)

RETVAL=0
aws glacier complete-multipart-upload \
    "${UPLOAD_TARGET[@]}" \
    --checksum="${TREE_HASH}" \
    --archive-size=${ARCHIVE_SIZE} || RETVAL=$?

if (( RETVAL == 0 )); then
    echo "Done."
    exit 0
fi

echo "complete-multipart-upload failed with status code: ${RETVAL}"
echo "Aborting upload ${UPLOAD_ID}"
aws glacier abort-multipart-upload "${UPLOAD_TARGET[@]}"
exit 1