Script to launch Amazon EC2 Spot instances

This script will

create a new key pair if one doesn't exist
create a security group if one doesn't exist
request a new spot instance
wait for the spot request to be fulfilled
wait for the instance to boot
connect to it via SSH and run a script of your choice

Install

First, install the dependencies. This is for Ubuntu Server 14.04.

sudo apt-get install -y python python-pip python-dev libffi-dev libssl-dev
sudo pip install boto paramiko

Create a new file (e.g. launch-spot.py) and give it exec permissions (chmod +x launch-spot.py).

#!/usr/bin/python2.7 -u

# pip install boto paramiko

import argparse
import boto, boto.ec2, boto.ec2.blockdevicemapping, boto.manage
import paramiko
import os, sys, time

#boto.set_stream_logger('boto')

def launch_spot_instance(id, profile, spot_wait_sleep=5, instance_wait_sleep=3):
  """Request (or resume) a spot instance and wait until it is running.

  Steps: create the key pair and security group when the profile does not
  name them, reuse an open/active spot request launched with that security
  group or place a new one, poll until the request gets an instance, then
  poll until the instance leaves the pending state.

  Args:
    id: Name of this launch. Used for the generated key pair ('KP-' + id),
      the generated security group ('SG-' + id) and the 'Name' tag put on
      the spot request and on the instance.
    profile: Dict of launch settings. Reads 'region', 'availability_zone',
      'price', 'type', 'image_id', 'disk_size',
      'disk_delete_on_termination' and, when a security group has to be
      created, 'firewall'. NOTE: the dict is modified in place -
      'key_pair' is set to (name, pem file name) and 'security_group' to
      (group id, group name) when they are missing; callers read them back.
    spot_wait_sleep: Seconds to sleep between polls of the spot request.
    instance_wait_sleep: Seconds to sleep between polls of the instance.

  Returns:
    The boto instance object, in state 'running'.

  Raises:
    Exception: unknown region, more than one matching spot request, the
      spot request ended without an instance, or the instance did not
      reach the 'running' state.
    boto.exception.EC2ResponseError: any EC2 error other than the
      "already exists" cases handled below.
  """
  ec2 = boto.ec2.connect_to_region(profile['region'])
  if ec2 is None:
    # connect_to_region reports an unknown region by returning None;
    # fail here instead of with an AttributeError further down.
    raise Exception('Unknown region: ' + str(profile['region']))

  if 'key_pair' not in profile:
    profile['key_pair'] = ('KP-' + id, 'KP-' + id + '.pem')
    try:
      print >> sys.stderr, 'Creating key pair...',
      keypair = ec2.create_key_pair('KP-' + id)
      # writes the private key into the current directory
      keypair.save('.')
      print >> sys.stderr, 'created'
    except boto.exception.EC2ResponseError as e:
      if e.code == 'InvalidKeyPair.Duplicate':
        print >> sys.stderr, 'already exists'
      else:
        # bare raise keeps the original traceback
        raise

  if 'security_group' not in profile:
    try:
      print >> sys.stderr, 'Creating security group...',
      sc = ec2.create_security_group('SG-' + id, 'Security Group for ' + id)
      for proto, fromport, toport, ip in profile['firewall']:
        sc.authorize(proto, fromport, toport, ip)
      profile['security_group'] = (sc.id, sc.name)
      print >> sys.stderr, 'created'
    except boto.exception.EC2ResponseError as e:
      if e.code == 'InvalidGroup.Duplicate':
        print >> sys.stderr, 'already exists'
        sc = ec2.get_all_security_groups(groupnames=['SG-' + id])[0]
        profile['security_group'] = (sc.id, sc.name)
      else:
        raise

  # A request that is still open/active for this security group is resumed
  # instead of placing (and paying for) a second one.
  existing_requests = ec2.get_all_spot_instance_requests(filters={'launch.group-id': profile['security_group'][0], 'state': ['open', 'active']})
  if existing_requests:
    if len(existing_requests) > 1:
      raise Exception('Too many existing spot requests')
    print >> sys.stderr, 'Reusing existing spot request'
    spot_req_id = existing_requests[0].id
  else:
    bdm = boto.ec2.blockdevicemapping.BlockDeviceMapping()
    bdm['/dev/sda1'] = boto.ec2.blockdevicemapping.BlockDeviceType(volume_type='gp2', size=profile['disk_size'], delete_on_termination=profile['disk_delete_on_termination'])
    bdm['/dev/sdb'] = boto.ec2.blockdevicemapping.BlockDeviceType(ephemeral_name='ephemeral0')
    print >> sys.stderr, 'Requesting spot instance'
    spot_reqs = ec2.request_spot_instances(
      price=profile['price'], image_id=profile['image_id'], instance_type=profile['type'], placement=profile['region'] + profile['availability_zone'],
      security_groups=[profile['security_group'][1]], key_name=profile['key_pair'][0], block_device_map=bdm)
    spot_req_id = spot_reqs[0].id

  print >> sys.stderr, 'Waiting for launch',
  instance_id = None
  spot_tag_added = False
  while not instance_id:
    spot_req = ec2.get_all_spot_instance_requests(request_ids=[spot_req_id])[0]
    if not spot_tag_added:
      spot_req.add_tag('Name', id)
      spot_tag_added = True
    if spot_req.state == 'failed':
      raise Exception('Spot request failed')
    instance_id = spot_req.instance_id
    if not instance_id:
      if spot_req.state in ('cancelled', 'closed'):
        # the request ended without ever getting an instance; polling
        # further would never terminate
        raise Exception('Spot request ' + spot_req.state)
      print >> sys.stderr, '.',
      time.sleep(spot_wait_sleep)
  print >> sys.stderr

  print >> sys.stderr, 'Retrieving instance by id'
  reservations = ec2.get_all_instances(instance_ids=[instance_id])
  instance = reservations[0].instances[0]
  instance.add_tag('Name', id)
  print >> sys.stderr, 'Got instance: ' + str(instance.id) +  ' [' + instance.state + ']'
  print >> sys.stderr, 'Waiting for instance to boot',
  while instance.state not in ('running', 'terminated', 'shutting-down'):
    print >> sys.stderr, '.',
    time.sleep(instance_wait_sleep)
    instance.update()
  print >> sys.stderr
  if instance.state != 'running':
    raise Exception('Instance was terminated')
  return instance

def connect_to_instance(ip, username, key_filename, timeout=10, max_retries=30, retry_sleep=2):
  """Open an SSH connection to a freshly booted instance, retrying until it answers.

  Unknown host keys are accepted automatically (AutoAddPolicy).

  Args:
    ip: Address of the instance.
    username: SSH user name.
    key_filename: Path of the private key file used to authenticate.
    timeout: Timeout in seconds passed to each connection attempt.
    max_retries: Number of connection attempts before giving up.
    retry_sleep: Seconds to wait between two attempts.

  Returns:
    A connected paramiko.SSHClient.

  Raises:
    Exception: the error of the last attempt when every attempt failed.
  """
  print >> sys.stderr, 'Connecting to SSH [' + ip + '] ',
  client = paramiko.SSHClient()
  client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
  for attempt in range(1, max_retries + 1):
    try:
      print >> sys.stderr, '.',
      client.connect(ip, username=username, key_filename=key_filename, timeout=timeout)
      break
    except Exception:
      # Exception (not a bare except) so Ctrl-C still aborts the wait.
      if attempt == max_retries:
        # Out of attempts: report the real error instead of handing back
        # a client that is not connected.
        print >> sys.stderr
        raise
      # A refused connection fails immediately; pause so the attempts are
      # spread over the time the SSH daemon needs to come up.
      time.sleep(retry_sleep)
  print >> sys.stderr
  return client

def setup_instance(id, instance, file, user_name, key_name):
  script = open(file, 'r').read().replace('\r', '')

  client = connect_to_instance(instance.ip_address, user_name, key_name)
  session = client.get_transport().open_session()
  session.set_combine_stderr(True)

  print >> sys.stderr, 'Running script: ' + os.path.relpath(file, os.getcwd())
  session.exec_command(script)
  stdout = session.makefile()
  try:
    for line in stdout:
      print line.rstrip()
  except (KeyboardInterrupt, SystemExit):
    print >> sys.stderr, 'Ctrl-C, stopping'
  client.close()
  exit_code = session.recv_exit_status()
  print >> sys.stderr, 'Exit code: ' + str(exit_code)
  return exit_code == 0

if __name__ == '__main__':

  profiles = {
    '15G': {
      'region': 'eu-west-1',
      'availability_zone': 'a',
      'price': '0.05',
      'type': 'r3.large',
      'image_id': 'ami-ed82e39e',
      'username': 'ubuntu',
      #'key_pair': ('AWS-EU', 'eu-key.pem'),
      'disk_size': 20,
      'disk_delete_on_termination': True,
      'scripts': [],
      'firewall': [ ('tcp', 22, 22, '0.0.0.0/0') ]
    }
  }

  parser = argparse.ArgumentParser(description='Launch spot instance')
  parser.add_argument('-n', '--name', help='Name', required=True)
  parser.add_argument('-p', '--profile', help='Profile', default=profiles.keys()[0], choices=profiles.keys())
  parser.add_argument('-s', '--script', help='Script path', action='append', default=[])
  parser.add_argument('-i', '--interactive', help='Connect to SSH', action='store_true')
  args = parser.parse_args()

  profile = profiles[args.profile]

  try:
    instance = launch_spot_instance(args.name, profile)
  except boto.exception.NoAuthHandlerFound:
    print >> sys.stderr, 'Error: No credentials found, try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables'
    sys.exit(1)

  for script in profile['scripts'] + args.script:
    if not setup_instance(id=args.name, instance=instance, file=script, user_name=profile['username'], key_name=profile['key_pair'][1]):
      break

  if args.interactive:
    print 'ssh ' + profile['username'] + '@' + instance.ip_address + ' -i ' + profile['key_pair'][1] + ' -oStrictHostKeyChecking=no'

The script is less than 200 lines long and should be readable from top to bottom.

Use

Set your Amazon AWS access keys as environment variables.

Or you can read the boto documentation to find out how to store the keys in configuration files.

export AWS_ACCESS_KEY_ID="XXXXXXXXXXXXXXXXXXXX"
export AWS_SECRET_ACCESS_KEY="XXXXXXXXXXXXXXXXXXXXXXXXXX"

Then launch a new spot instance like this

$ ./launch-spot.py -n test -p 15G
Creating key pair... created
Creating security group... created
Requesting spot instance
Waiting for launch . . . . . .
Retrieving instance by id
Got instance: i-15fc09d5 [pending]
Waiting for instance to boot . . . .

-n stands for --name and it is how you can identify this spot instance.

-p stands for --profile and is a collection of settings for launching this instance (region, availability zone, instance type, max spot price, AMI, SSH username, disk size, security group rules, etc.)

The profiles are hard coded in the script. Feel free to modify the script to load them from an external configuration file (in JSON perhaps).

Currently, there is just one profile named 15G

# Launch profiles, selected on the command line with -p/--profile.
profiles = {
  '15G': {
    # EC2 region; region + availability_zone is the placement
    'region': 'eu-west-1',
    'availability_zone': 'a',
    # price passed with the spot request
    'price': '0.05',
    # instance type
    'type': 'r3.large',
    # AMI to boot
    'image_id': 'ami-ed82e39e',
    # SSH user name on that AMI
    'username': 'ubuntu',
    # optional: (key pair name, private key file); created when missing
    #'key_pair': ('AWS-EU', 'eu-key.pem'),
    # size and delete-on-termination flag of the /dev/sda1 volume
    'disk_size': 20,
    'disk_delete_on_termination': True,
    # scripts that always run, before the ones given with -s
    'scripts': [],
    # (protocol, from port, to port, CIDR) rules for a new security group
    'firewall': [ ('tcp', 22, 22, '0.0.0.0/0') ]
  }
}

You can run the script again and as long as you use the same name it will resume.

$ ./launch-spot.py -n test -p 15G
Creating key pair... already exists
Creating security group... already exists
Reusing existing spot request
Waiting for launch
Retrieving instance by id
Got instance: i-15fc09d5 [running]
Waiting for instance to boot

To run a script on this instance via SSH use one or more -s arguments.

For example, if you have test.sh

#!/bin/bash
touch /tmp/i-was-here
ls -l /tmp/i-was-here

this will be the output of the script

$ ./launch-spot.py -n test -p 15G -s test.sh
Creating key pair... already exists
Creating security group... already exists
Reusing existing spot request
Waiting for launch
Retrieving instance by id
Got instance: i-15fc09d5 [running]
Waiting for instance to boot
Connecting to SSH [54.74.149.116]  .
Running script: test.sh
-rw-rw-r-- 1 ubuntu ubuntu 0 Oct 10 11:53 /tmp/i-was-here
Exit code: 0

If you want to connect to this instance manually, you can use the -i flag:

$ ./launch-spot.py -n test -p 15G -i
Creating key pair... already exists
Creating security group... already exists
Reusing existing spot request
Waiting for launch
Retrieving instance by id
Got instance: i-15fc09d5 [running]
Waiting for instance to boot
ssh ubuntu@54.74.149.116 -i KP-test.pem -oStrictHostKeyChecking=no

You can copy & paste the last line in your terminal and connect to the instance.

As you can see, if you haven't specified key_pair in the profile, a new key pair will be created with the name KP-test and it will be saved as KP-test.pem.

If there's no security_group in the profile, a new one will be created with the name SG-test and the rules from the firewall profile setting will be applied. At a minimum, SSH from your IP should be allowed.

If you want to stop the spot instance, simply run sudo halt on the instance. The instance and the spot request will be shut down and terminated automatically, and so will the disk if disk_delete_on_termination is enabled in the profile.

Scripting

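If you use a simple bash script, I suggest structuring it like this. set -e stops the execution as soon as a command fails. The marker file /var/setup.done is only created after everything succeeded, so a script that finished successfully will not do its work again on the next run.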

#!/bin/bash

# Abort at the first command that fails, so the marker file below is only
# written after every setup step succeeded.
set -e

# /var/setup.done marks a completed setup; skip everything when it exists.
if [ ! -f /var/setup.done ]; then

    # (placeholder: the actual setup commands go here)
    ...

    # Record success so the next run of this script does nothing.
    sudo touch /var/setup.done

fi

If you share a provisioning script between Vagrant and AWS, you can use these lines to check whether you are running on AWS EC2 or locally on Vagrant.

# AWS: output of the EC2 metadata endpoint (1 second timeout), empty when it
# cannot be reached.
export AWS="$(curl -s -m 1 http://169.254.169.254/latest/meta-data/instance-id 2> /dev/null)"
# VAGRANT: first entry of /vagrant, empty when that directory is missing or empty.
export VAGRANT="$(ls /vagrant 2> /dev/null | head -n 1)"

Amazon Ubuntu mirrors and apt-get can be slow and flaky, especially if you hit CTRL+C while the script is executing apt-get and then run it again.

# Drop the cached package lists and downloaded packages; this may or may not
# prevent "hash sum mismatch" errors.
sudo rm -rf /var/lib/apt/lists/*
sudo apt-get clean
# The Amazon mirrors are very slow: point apt at the regular Ubuntu mirrors.
sudo sed -i \
  -e 's/us-east-1.ec2.archive.ubuntu.com/us.archive.ubuntu.com/g' \
  -e 's/eu-west-1.ec2.archive.ubuntu.com/ie.archive.ubuntu.com/g' \
  /etc/apt/sources.list
# Fetch fresh package lists.
sudo apt-get update
# Finish package configuration in case a previous apt-get was interrupted.
sudo dpkg --configure -a

Create a file system on the instance disk and move /home, /tmp and /swapfile to it. This can be useful on large SSD backed instance disks.

# AWS is non-empty only when the EC2 metadata endpoint answers.
export AWS=`curl -s -m 1 http://169.254.169.254/latest/meta-data/instance-id 2> /dev/null`
# Set to 1 to put /home/ubuntu, /tmp and the swap file on the instance store.
export AWS_INSTANCE_STORE=1

# -eq is the numeric comparison; "-e" is not a binary test operator, so the
# original condition always failed with an error and this block never ran.
# The /mnt/instance check makes the block a no-op on later runs.
if [ "$AWS" ] && [ "$AWS_INSTANCE_STORE" -eq "1" ] && [ ! -d /mnt/instance ]; then
  # aws instance store
  sudo mkfs -t ext4 /dev/xvdb
  sudo mkdir -p /mnt/instance
  sudo mount /dev/xvdb /mnt/instance
  sudo chown -R "$USER:$USER" /mnt/instance

  # don't use root disk: move each location and leave a symlink behind
  [ ! -d /mnt/instance/home ] && sudo mv /home/ubuntu /mnt/instance/home && sudo ln -s /mnt/instance/home /home/ubuntu
  [ ! -d /mnt/instance/tmp ] && sudo mv /tmp/ /mnt/instance/ && sudo ln -s /mnt/instance/tmp /tmp
  [ ! -f /mnt/instance/swapfile ] && sudo touch /mnt/instance/swapfile && sudo ln -s /mnt/instance/swapfile /swapfile
fi

Add swap.

# Size of the swap file; leave empty to skip adding swap.
export SWAP="4G"

if [ "$SWAP" ]; then
  # chmod 600: a swap file holds memory pages and must not be readable by
  # other users; without it the file keeps the default permissions.
  sudo fallocate -l "$SWAP" /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile
  # kernel tuning applied together with the swap file
  sudo sysctl vm.swappiness=1 && sudo sysctl vm.vfs_cache_pressure=50
fi

Keep /tmp in RAM.

# Set TMPFS to 1 to mount /tmp as tmpfs; 0 (the value here) disables the block.
export TMPFS=0
# Size of the extra swap file added with the tmpfs; leave empty to skip it.
export TMPFS_SWAP="5G"

if [ "$TMPFS" ] && [ "$TMPFS" -eq "1" ]; then
  sudo mount -o defaults,noatime,nosuid,nodev,noexec,mode=1777,size=500G -t tmpfs tmpfs /tmp
  # chmod 600: a swap file holds memory pages and must not be readable by
  # other users; without it the file keeps the default permissions.
  [ "$TMPFS_SWAP" ] && sudo fallocate -l "$TMPFS_SWAP" /swapfile.tmpfs && sudo chmod 600 /swapfile.tmpfs && sudo mkswap /swapfile.tmpfs && sudo swapon /swapfile.tmpfs
fi

To upload/download files to S3 on AWS and use local files in /vagrant when running in Vagrant, you could do something like this.

# AWS is non-empty when the EC2 metadata endpoint answers, VAGRANT when
# /vagrant exists and has content.
export AWS="$(curl -s -m 1 http://169.254.169.254/latest/meta-data/instance-id 2> /dev/null)"
export AWS_ACCESS_KEY_ID="XXXXXXXXXXXXXXXXXXXX"
export AWS_SECRET_ACCESS_KEY="XXXXXXXXXXXXXXXXXXXXXXXXXX"
export VAGRANT="$(ls /vagrant 2> /dev/null | head -n 1)"

sudo apt-get install -y python-pip
sudo pip install awscli
# Answer the prompts of "aws configure": key id, secret key, then empty
# lines for the remaining prompts.
if [ -n "$AWS_ACCESS_KEY_ID" ] && [ -n "$AWS_SECRET_ACCESS_KEY" ]; then
  echo -e "$AWS_ACCESS_KEY_ID\n$AWS_SECRET_ACCESS_KEY\n\n" | aws configure
fi

# Fetch the input: from the shared folder under Vagrant, from S3 on AWS.
if [ -n "$VAGRANT" ]; then
  cp /vagrant/file.tar.gz file.tar.gz
elif [ -n "$AWS" ]; then
  aws s3 cp s3://bucket/file.tar.gz file.tar.gz
fi

...

# Store the result the same way.
if [ -n "$VAGRANT" ]; then
  cp result.tar.gz /vagrant/result.tar.gz
elif [ -n "$AWS" ]; then
  aws s3 cp result.tar.gz s3://bucket/result.tar.gz --storage-class REDUCED_REDUNDANCY
fi