"""
|
|
The script provides functionality:
|
|
1. Backup your PostgresSQL database to sql file using pg_dump
|
|
2. Compress sql file using LZMA2
|
|
3. Upload compressed file to S3 storage
|
|
4. Scheduled to run every hour
|
|
|
|
You can set some envs vars:
|
|
- SCHEDULE - can be monthly, weekly, daily, hourly (by default)
|
|
- DB_USER - user to connect DB, default is postgres
|
|
- DB_PASSWORD - password to connect DB, default is postgres
|
|
- DB_HOST - host to connect DB, default is localhost
|
|
- DB_PORT - port to connect DB, default is 5432
|
|
- DB_NAME - database to back up, default is postgres
|
|
- TIME_ZONE - timezone for datetime using in filenames, default is Europe/Tallinn
|
|
- COMPRESSION_LEVEL - level of LZMA compression, default is 7
|
|
- PREFIX - prefix for backup filename, default is empty
|
|
|
|
Settings for S3 storage:
|
|
- AWS_S3_REGION_NAME - default nl-ams
|
|
- AWS_S3_ENDPOINT_URL - default https://s3.nl-ams.scw.cloud
|
|
- AWS_ACCESS_KEY_ID
|
|
- AWS_SECRET_ACCESS_KEY
|
|
- AWS_BUCKET_NAME
|
|
"""

import lzma
import os
import shutil
import subprocess
import threading
import time
from datetime import datetime

import boto3
import pytz
import schedule
from boto3.exceptions import S3UploadFailedError

SCHEDULE = os.getenv("SCHEDULE", "HOURLY")
DB_USER = os.getenv("DB_USER", "postgres")
DB_PASSWORD = os.getenv("DB_PASSWORD", "postgres")
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "postgres")
TIME_ZONE = pytz.timezone(os.getenv("TIME_ZONE", "Europe/Tallinn"))
PREFIX = os.getenv("PREFIX", "")
AWS_S3_REGION_NAME = os.getenv("AWS_S3_REGION_NAME", "nl-ams")
AWS_S3_ENDPOINT_URL = os.getenv("AWS_S3_ENDPOINT_URL", "https://s3.nl-ams.scw.cloud")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")

COMPRESSION_SETTINGS = [
    {
        "id": lzma.FILTER_LZMA2,
        "preset": int(os.getenv("COMPRESSION_LEVEL", "7")) | lzma.PRESET_EXTREME,
    },
]
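
# OR-ing the preset with lzma.PRESET_EXTREME trades extra CPU time during
# compression for a slightly better ratio at the same COMPRESSION_LEVEL.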


def backup_db_from_postgres(file_path: str) -> bool:
    """
    Back up the database from PostgreSQL to a file using pg_dump

    :param file_path: path of the SQL dump file to write
    :return: True on success, False on failure
    """
    postgres_connection_url = (
        f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    )
    try:
        process = subprocess.Popen(
            [
                "pg_dump",
                f"--dbname={postgres_connection_url}",
                "-f",
                file_path,
                "-Fp",
            ],
            stdout=subprocess.PIPE,
        )
        process.communicate()
        if process.returncode != 0:
            print(f"Command failed. Return code: {process.returncode}")
            return False
        return True
    except (subprocess.SubprocessError, OSError) as exception:
        print(exception)
        return False
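
# Note: -Fp selects pg_dump's plain-text SQL format; the custom format (-Fc)
# is compressed by pg_dump itself, which would make the LZMA step redundant.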


def compress_file_to_xz(file_path: str) -> str:
    """
    Compress a file with LZMA2 into a .xz file and remove the original

    :param file_path: path of the file to compress
    :return: path of the compressed .xz file
    """
    compressed_file_path = f"{file_path}.xz"
    with open(file_path, "rb") as origin_file:
        with lzma.open(
            compressed_file_path, "wb", filters=COMPRESSION_SETTINGS
        ) as compressed_file:
            shutil.copyfileobj(origin_file, compressed_file)
    os.remove(file_path)
    return compressed_file_path
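
# lzma.open writes the xz container format by default, so the output of the
# custom FILTER_LZMA2 chain above is still decompressible with a plain `xz -d`.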


def upload_to_s3(compressed_file_path: str, filename: str) -> bool:
    """
    Upload the compressed file to S3 under a YYYY/MM/DD key prefix

    :param compressed_file_path: local path of the file to upload
    :param filename: object name to use inside the dated prefix
    :return: True on success, False on failure
    """
    time_string = datetime.now(tz=TIME_ZONE).strftime("%Y/%m/%d")
    destination_folder = f"{time_string}/{filename}"
    try:
        s3_client = boto3.client(
            service_name="s3",
            region_name=AWS_S3_REGION_NAME,
            endpoint_url=AWS_S3_ENDPOINT_URL,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        )
        s3_client.upload_file(
            Filename=compressed_file_path,
            Bucket=AWS_BUCKET_NAME,
            Key=destination_folder,
        )
        os.remove(compressed_file_path)
        return True
    except S3UploadFailedError as exception:
        print(exception)
        return False
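
# The YYYY/MM/DD key prefix is not a real directory: S3 keys are flat, and the
# slashes are only rendered as folders by most S3 browsers.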


def run_backup_database() -> None:
    """
    Run the full backup pipeline: dump, compress, upload
    """
    scheduled_time = datetime.now(tz=TIME_ZONE).strftime("%y%m%d-%H%M")
    filename = f"{DB_NAME}-{scheduled_time}.sql"
    if PREFIX:
        filename = f"{PREFIX}-{filename}"
    sql_file_path = f"/tmp/{filename}"

    backup_success = backup_db_from_postgres(file_path=sql_file_path)
    if not backup_success:
        print("Backup failed")
        return
    compressed_file_path = compress_file_to_xz(file_path=sql_file_path)
    upload_success = upload_to_s3(compressed_file_path, f"{filename}.xz")
    if not upload_success:
        print("Upload failed")
        return
    upload_time = datetime.now(tz=TIME_ZONE).strftime("%Y-%m-%d %H:%M")
    print(f"Made backup at {scheduled_time} and uploaded to S3 at {upload_time}")


def run_threaded(job_func):
    """
    Run a job in a background thread so a long backup doesn't block the scheduler

    :param job_func: callable to run in the thread
    """
    job_thread = threading.Thread(target=job_func)
    job_thread.start()
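
# This follows the threaded-execution recipe from the schedule library's docs;
# without it, a backup that overruns its slot would delay every pending job.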


match SCHEDULE:
    case "MONTHLY":
        print("Scheduled to run backup task every 4 weeks")
        schedule.every(4).weeks.do(run_threaded, run_backup_database)
    case "WEEKLY":
        print("Scheduled to run backup task every Monday at 02:00")
        schedule.every().monday.at("02:00").do(run_threaded, run_backup_database)
    case "DAILY":
        print("Scheduled to run backup task every day at 02:00")
        schedule.every().day.at("02:00").do(run_threaded, run_backup_database)
    # For any other value, incl. HOURLY, run hourly
    case _:
        print("Scheduled to run backup task every hour")
        schedule.every().hour.at(":05").do(run_threaded, run_backup_database)

# Run the first job immediately
schedule.run_all()

while True:
    schedule.run_pending()
    time.sleep(1)