MakeArchive

From RdiffBackupWiki

Jump to: navigation, search

This script is intended to make a snapshot of the backup as it was at a date in the past. It can be used to create archives on CD, DVD or other backup media. When given a space constraint, it calculates the datetime of the latest snapshot that still fits into that space. The snapshot includes the source tree at that specific datetime and all the rdiff-backup data needed to recreate the source at any earlier time. This script is useful if your backup device has run out of space, but you would rather write your old backups to CD/DVD than throw them away. After running this script, you can clean up your backup tree with the --remove-older-than option of rdiff-backup.

The script is written in Python. You can call it with:

makearchive /path/to/backup/tree /required/path/to/snapshot/ [maximum size in bytes]

If no size constraint is given, the script assumes a single-layer DVD (4,700,000,000 bytes).

#!/usr/bin/env python
#
# Copyright 2007 by Pieter Edelman (p _dot_ edelman _at_ gmx _dot_ net)
#

import os
import re
import shutil
import subprocess
import sys
import time

# Takes three arguments; the dir where the backup tree is stored (including
# rdiff-backup-data), the dir where the result should be, and an optional number
# of maximum bytes (defaults to single-layer DVD).

def getOptimalDT(backup_dir, req_size):
  """ Find the latest backup whose archive still fits into req_size bytes.

      backup_dir is the rdiff-backup-data directory of the backup tree and
      req_size is the maximum archive size in bytes.

      The archive size for a session is estimated as the SourceFileSize value
      recorded in that session's session_statistics file, plus the size of
      every timestamped rdiff-backup file found anywhere below backup_dir whose
      timestamp is that session's or an earlier one.

      Returns the list [optimal_dt, optimal_size]. optimal_dt is None and
      optimal_size is 0 when no session fits into req_size. """
  stats_prefix = "session_statistics."
  dt_length    = 25  # Length of a "YYYY-MM-DDThh:mm:ss+hh:mm" timestamp
  size_re      = re.compile(r"SourceFileSize ([0-9]+)")
  stamp_re     = re.compile(r".*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2})\.(missing|dir|diff|data|snapshot)(\.gz)?$")
  source       = {}
  increment    = {}

  # Analyze the session_statistics files for the file size of the source trees
  for log_file in os.listdir(backup_dir):
    if not log_file.startswith(stats_prefix):
      continue

    # Extract the datetime from the title
    dt = log_file[len(stats_prefix):len(stats_prefix) + dt_length]

    # Read the log and search for the appropriate value; the with-statement
    # makes sure the file handle is closed again
    log_path = os.path.join(backup_dir, log_file)
    with open(log_path, "r") as log:
      size_match = size_re.search(log.read())
    if size_match is None:
      # Without a source size this session can't be a candidate
      print("No SourceFileSize in %s, skipping...." % log_path)
      continue
    source[dt] = int(size_match.group(1))

  # Count the size of all the backup data for each date. os.walk already
  # yields directory paths that start with backup_dir, so they must not be
  # joined onto backup_dir a second time.
  for dir_path, _dir_names, file_names in os.walk(backup_dir):
    for file_name in file_names:
      time_match = stamp_re.search(file_name)
      if not time_match:
        continue
      file_path = os.path.join(dir_path, file_name)
      try:
        size = os.stat(file_path).st_size
      except OSError:
        print("Couldn't stat %s, skipping...." % file_path)
        continue  # Don't count an undefined or stale size
      dt = time_match.group(1)
      increment[dt] = increment.get(dt, 0) + size

  # Find the optimal datetime. Walk over every known timestamp in ascending
  # order so the running total contains all backup data up to that point, but
  # only consider timestamps with a known source size as candidates.
  optimal_dt   = None
  optimal_size = 0
  incr_size    = 0
  for dt in sorted(set(source) | set(increment)):
    incr_size += increment.get(dt, 0)
    if dt not in source:
      continue

    # Keep looking after a miss: a later session may have a smaller source
    # tree and fit again.
    total_size = source[dt] + incr_size
    if total_size <= req_size:
      optimal_dt   = dt
      optimal_size = total_size

  return [optimal_dt, optimal_size]
      
def copyIncrements(source_dir, dest_dir, dest_dt):
  """ Recursively copy all increment files that are older than dest_dt.

      source_dir is the directory to read the increments from, dest_dir the
      directory to copy them to (created if needed, including missing
      parents) and dest_dt the timestamp string of the archived backup.

      Every sub directory of source_dir is recreated under dest_dir. A file is
      copied when its name carries an rdiff-backup timestamp that sorts
      strictly before dest_dt. Timestamps are compared as strings, which
      presumably assumes that all of them use the same UTC offset. Files that
      can't be copied are reported and skipped. """
  stamp_re = re.compile(r".*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2})\.(missing|dir|diff|data|snapshot)(\.gz)?$")

  # Mk dir on destination; makedirs also creates missing parent directories
  if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

  for entry in os.listdir(source_dir):
    source_path = os.path.join(source_dir, entry)

    # Directories are handled by descending into them
    if os.path.isdir(source_path):
      copyIncrements(source_path, os.path.join(dest_dir, entry), dest_dt)
      continue

    # Only copy files from before the specified time
    time_match = stamp_re.search(entry)
    if time_match and (time_match.group(1) < dest_dt):
      try:
        shutil.copy(source_path, dest_dir)
      except (IOError, OSError):
        # Best effort: one unreadable increment shouldn't stop the archive
        print("Problem with file %s, ignoring it." % source_path)

def copyBackupData(backup_dir, dest_dir, dest_dt):
  """ Copy the rdiff-backup files up to timestamp dest_dt under backup_dir to a
      tree in dest_dir.

      Only the regular files directly inside backup_dir/rdiff-backup-data are
      considered. A file is copied to dest_dir/rdiff-backup-data (created if
      needed, including missing parents) when its name carries a timestamp
      that sorts at or before dest_dt and ends in .snapshot, .data or .dir,
      optionally followed by .gz. The session_statistics file of dest_dt
      itself is additionally copied under the name current_mirror.<...>. """
  stamp_re     = re.compile(r".*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2})\.(snapshot|data|dir)(\.gz)?$")
  stats_prefix = "session_statistics."

  # Ascend into rdiff-backup-data
  backup_dir = os.path.join(backup_dir, "rdiff-backup-data")
  dest_dir   = os.path.join(dest_dir, "rdiff-backup-data")
  if not os.path.exists(dest_dir):
    # makedirs instead of mkdir, so a not yet existing dest_dir is no problem
    os.makedirs(dest_dir)

  # Search for the relevant files
  for entry in os.listdir(backup_dir):
    source_path = os.path.join(backup_dir, entry)
    if os.path.isdir(source_path):
      continue

    time_match = stamp_re.search(entry)
    if not (time_match and (time_match.group(1) <= dest_dt)):
      continue

    # Copy it
    shutil.copy(source_path, dest_dir)

    # If we have the session statistics file of the latest backup date, copy
    # it also to "current_mirror"
    if (time_match.group(1) == dest_dt) and entry.startswith(stats_prefix):
      mirror_name = entry.replace("session_statistics", "current_mirror")
      shutil.copy(source_path, os.path.join(dest_dir, mirror_name))

# Get the options from the command line
if (len(sys.argv) < 3):
  sys.stderr.write("Usage: %s /path/to/backup/tree /path/to/snapshot [maximum size in bytes]\n" % sys.argv[0])
  sys.exit(2)
backup_dir = sys.argv[1]
dest_dir   = sys.argv[2]
if (len(sys.argv) > 3):
  try:
    req_size = int(sys.argv[3])
  except ValueError:
    sys.stderr.write("The maximum size must be a whole number of bytes, not '%s'\n" % sys.argv[3])
    sys.exit(2)
else:
  # Default to the capacity of a single-layer DVD
  req_size = 4700000000

# Get the required datetime
optimal = getOptimalDT(os.path.join(backup_dir, "rdiff-backup-data"), req_size)
# getOptimalDT always returns a two-element list, so the list itself is never
# false; the missing timestamp is what signals that nothing fits.
if (optimal[0] is None):
  print("There is no timestamp that satisfies your demands!")
  sys.exit(1)
else:
  print("I will use the backup from %s (%d bytes)" % (optimal[0], optimal[1]))

print("Restoring the main data")
# Pass the arguments as a list instead of a shell string, so paths containing
# spaces or shell metacharacters are handed to rdiff-backup unchanged.
try:
  status = subprocess.call(["rdiff-backup", "--restore-as-of", optimal[0], backup_dir, dest_dir])
except OSError:
  sys.stderr.write("Couldn't run rdiff-backup, is it installed?\n")
  sys.exit(1)
if (status != 0):
  # Report the problem, but carry on like before with copying the backup data
  sys.stderr.write("Warning: rdiff-backup exited with status %d\n" % status)
print("Restoring the backup data")
copyBackupData(backup_dir, dest_dir, optimal[0])
copyIncrements(os.path.join(backup_dir, "rdiff-backup-data", "increments"), os.path.join(dest_dir, "rdiff-backup-data", "increments"), optimal[0])

Personal tools