DevOps Scripting with Python

Master Python for real-world DevOps automation, system administration, and cloud integration.

Running External Commands

subprocess Module

Execute system commands and capture output:

import os
import subprocess

# Run a command and wait for it to finish. capture_output=True collects
# stdout/stderr on the result object; text=True decodes them to str.
result = subprocess.run(["ls", "-la"], capture_output=True, text=True)
print(result.stdout)
print(result.returncode)  # 0 = success

# Run through the shell (shell=True) so that "$HOME" is expanded by the shell.
# Only pass fixed, trusted strings this way: the whole string is interpreted
# by the shell, so interpolating external input allows command injection.
result = subprocess.run("echo $HOME", shell=True, text=True, capture_output=True)
print(result.stdout)

# Pass environment variables: work on a copy of the current environment so
# the child process sees APP_ENV without os.environ of this process changing.
env = os.environ.copy()
env["APP_ENV"] = "production"
result = subprocess.run(["app", "start"], env=env)

# Real-world: Check service status
def check_service_status(service_name):
    """Report whether ``systemctl is-active`` succeeds for a unit.

    Args:
        service_name: Name of the systemd unit to query.

    Returns:
        True when the command exits with status 0, False otherwise.
    """
    command = ["systemctl", "is-active", service_name]
    # Output is captured only to keep it off the console; the exit status
    # alone decides the result.
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.returncode == 0

Real-world example: Deploy script

def deploy_application(version, environment):
    """Run the deployment steps in order, stopping at the first failure.

    Args:
        version: Version number; the tag ``v<version>`` is checked out.
        environment: Environment label, used only in the success message.

    Raises:
        RuntimeError: If any step exits with a non-zero status. Later steps
            are not run.
    """
    steps = [
        ["git", "pull", "origin", "main"],
        ["git", "checkout", f"v{version}"],
        ["pip", "install", "-r", "requirements.txt"],
        ["python", "manage.py", "migrate"],
        ["systemctl", "restart", "app-service"]
    ]

    for argv in steps:
        exit_code = subprocess.run(argv).returncode
        if exit_code == 0:
            continue
        failed_step = " ".join(argv)
        raise RuntimeError(f"Deployment failed at: {failed_step}")

    print(f"Successfully deployed v{version} to {environment}")

Operating System Operations

os Module

File and directory operations:

import os

# Current working directory
print(os.getcwd())
os.chdir("/opt/app")  # relative paths used below now resolve against /opt/app

# List files (names only, not full paths) in the current directory
files = os.listdir(".")
for f in files:
    print(f)

# Create directory, including missing parents; exist_ok=True makes the call
# a no-op when the directory is already there.
os.makedirs("/var/log/app", exist_ok=True)

# Check path exists
if os.path.exists("/etc/config.yaml"):
    print("Config found")

# Get file size in bytes ("app.log" is relative to the current directory)
size = os.path.getsize("app.log")
print(f"Log size: {size} bytes")

# Environment variables: the second argument is the fallback when the
# variable is unset; without one, os.getenv returns None.
debug_mode = os.getenv("DEBUG", "false")
api_key = os.getenv("API_KEY")

# Set environment variable for this process (values must be strings)
os.environ["APP_DEBUG"] = "true"

Real-world: Cleanup script

import os
import time

def cleanup_old_logs(log_dir, days=7):
    """Delete files in ``log_dir`` whose modification time is older than ``days`` days.

    Only regular files directly inside ``log_dir`` are considered; the
    directory is not walked recursively and file names are not filtered.

    Args:
        log_dir: Directory to clean.
        days: Age threshold in days.
    """
    max_age_seconds = days * 86400
    threshold = time.time() - max_age_seconds

    for entry in os.listdir(log_dir):
        candidate = os.path.join(log_dir, entry)
        if not os.path.isfile(candidate):
            continue
        if os.path.getmtime(candidate) >= threshold:
            continue
        os.remove(candidate)
        print(f"Deleted: {candidate}")

# Example run: relies on the default 7-day threshold for /var/log/app.
cleanup_old_logs("/var/log/app")

shutil Module

File operations:

import shutil

# Copy file
shutil.copy("source.txt", "destination.txt")
shutil.copy2("source.txt", "dest.txt")  # Preserves metadata

# Copy directory tree (by default this raises if "dest_dir" already exists)
shutil.copytree("source_dir", "dest_dir")

# Move/rename
shutil.move("old_name.txt", "new_name.txt")

# Remove directory tree: deletes the directory and everything under it,
# with no confirmation and no undo.
shutil.rmtree("temp_dir")

# Disk usage: the three values are byte counts, converted to GiB for display
total, used, free = shutil.disk_usage("/")
print(f"Free space: {free / (1024**3):.2f} GB")

Real-world: Backup script

def backup_directory(source, backup_dir):
    """Create a timestamped copy of a directory.

    The copy is written to ``<backup_dir>/backup_<YYYYmmdd_HHMMSS>``.

    Args:
        source: Directory to copy.
        backup_dir: Directory under which the backup folder is created.

    Returns:
        The path of the new backup directory, or None if the copy failed
        with a filesystem error (the error is printed).
    """
    # Imported here because the function needs datetime and must not depend
    # on a module-level import being present.
    from datetime import datetime

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"{backup_dir}/backup_{timestamp}"

    try:
        shutil.copytree(source, backup_path)
    except OSError as e:
        # shutil.Error is a subclass of OSError, so this covers both plain
        # filesystem failures and copytree's aggregated copy errors.
        print(f"Backup failed: {e}")
        return None

    print(f"Backup created: {backup_path}")
    return backup_path

SSH and Remote Operations

Paramiko Library

Execute commands on remote servers via SSH:

import paramiko

def execute_remote_command(hostname, username, command, key_file=None, password=None):
    """Execute a command on a remote server over SSH.

    Args:
        hostname: Host to connect to.
        username: Login name on the remote host.
        command: Command line to run remotely.
        key_file: Path to a private key file. Takes precedence over
            ``password`` when given.
        password: Password for authentication. When both ``key_file`` and
            ``password`` are omitted, Paramiko falls back to its default
            methods (SSH agent and key discovery).

    Returns:
        A ``(output, error)`` tuple with the decoded stdout and stderr of
        the command.
    """
    client = paramiko.SSHClient()
    # NOTE(security): AutoAddPolicy accepts any unknown host key, which makes
    # the first connection to a host open to man-in-the-middle attacks.
    # Prefer loading known host keys where the deployment allows it.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # Connect with key or password; credentials come from the caller and
        # are never hard-coded here.
        if key_file:
            client.connect(hostname, username=username, key_filename=key_file)
        else:
            client.connect(hostname, username=username, password=password)

        # Execute command
        stdin, stdout, stderr = client.exec_command(command)

        # Get output. stdout is drained before stderr.
        # NOTE(review): a command writing a very large amount to stderr may
        # stall this read order - confirm for long-running commands.
        output = stdout.read().decode()
        error = stderr.read().decode()

        return output, error
    finally:
        # Always release the connection, including when connect/exec fails.
        client.close()

# Usage: run "df -h" on one host, authenticating with a private key file.
# Both stdout and stderr are returned; only stdout is printed here.
output, error = execute_remote_command(
    "prod-01.example.com",
    "ubuntu",
    "df -h",
    key_file="/home/user/.ssh/id_rsa"
)
print(output)

Real-world: Multi-server health check

def health_check_servers(servers, timeout=10):
    """Check health across multiple servers over SSH.

    Each server is probed by counting the lines of ``ps aux``.

    Args:
        servers: Iterable of dicts with ``"host"`` and ``"user"`` keys.
        timeout: Seconds allowed for connecting and for the remote command,
            so one unresponsive host cannot stall the whole sweep.

    Returns:
        Dict keyed by hostname. Reachable hosts map to
        ``{"status": "healthy" | "warning", "processes": <count>}``;
        hosts that fail for any reason map to
        ``{"status": "down", "error": <message>}``.
    """
    results = {}

    for server in servers:
        hostname, username = server["host"], server["user"]
        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        try:
            client.connect(hostname, username=username, timeout=timeout)
            stdin, stdout, stderr = client.exec_command(
                "ps aux | wc -l", timeout=timeout
            )
            proc_count = int(stdout.read().decode().strip())

            results[hostname] = {
                "status": "healthy" if proc_count > 5 else "warning",
                "processes": proc_count
            }
        except Exception as e:
            # Deliberately broad: connection, authentication, timeout and
            # parse failures are all reported as "down" for this host so the
            # remaining hosts are still checked.
            results[hostname] = {"status": "down", "error": str(e)}
        finally:
            client.close()

    return results

AWS Integration with Boto3

Interact with AWS services:

import boto3

# EC2 operations
ec2 = boto3.client("ec2", region_name="us-east-1")

# List all instances. describe_instances returns results in pages, so a
# single call can miss instances; the paginator follows every page.
paginator = ec2.get_paginator("describe_instances")
for page in paginator.paginate():
    for reservation in page["Reservations"]:
        for instance in reservation["Instances"]:
            print(f"Instance: {instance['InstanceId']} - {instance['State']['Name']}")

# Start instance
ec2.start_instances(InstanceIds=["i-1234567890abcdef0"])

# Create security group rule: allows inbound TCP 443 from any IPv4 address
# (0.0.0.0/0). Narrow the CIDR for anything that is not meant to be public.
ec2.authorize_security_group_ingress(
    GroupId="sg-12345678",
    IpProtocol="tcp",
    FromPort=443,
    ToPort=443,
    CidrIp="0.0.0.0/0"
)

S3 operations:

s3 = boto3.client("s3")

# List buckets
response = s3.list_buckets()
for bucket in response["Buckets"]:
    print(bucket["Name"])

# Upload file (local path, bucket, object key)
s3.upload_file("local_file.txt", "my-bucket", "s3_key.txt")

# Download file (bucket, object key, local path)
s3.download_file("my-bucket", "s3_key.txt", "local_file.txt")

# List objects in bucket. list_objects_v2 returns a limited number of keys
# per call, so use the paginator to see every object under the prefix.
# "Contents" is absent from a page with no matching objects.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="my-bucket", Prefix="logs/"):
    for obj in page.get("Contents", []):
        print(obj["Key"])

Real-world: Automated backups to S3

def backup_to_s3(local_dir, bucket_name, s3_prefix):
    """Upload the files of a local directory to S3 under a dated prefix.

    Each regular file directly inside ``local_dir`` is uploaded to
    ``<s3_prefix>/<YYYYmmdd>/<filename>``. Subdirectories are skipped.
    Upload failures are printed and do not stop the remaining uploads.

    Args:
        local_dir: Directory whose files are uploaded.
        bucket_name: Destination bucket.
        s3_prefix: Key prefix placed before the date component.
    """
    # Imported here because the function needs datetime and must not depend
    # on a module-level import being present.
    from datetime import datetime

    s3 = boto3.client("s3")
    # Compute the date once so a run that crosses midnight still writes all
    # of its files under a single dated prefix.
    date_stamp = datetime.now().strftime("%Y%m%d")

    for filename in os.listdir(local_dir):
        filepath = os.path.join(local_dir, filename)
        if not os.path.isfile(filepath):
            continue

        s3_key = f"{s3_prefix}/{date_stamp}/{filename}"
        try:
            s3.upload_file(filepath, bucket_name, s3_key)
        except Exception as e:
            # Best effort per file: report the failure and keep going.
            print(f"Failed to upload {filename}: {e}")
        else:
            print(f"Uploaded: {filename}")

HTTP Requests

requests Library

Make HTTP requests:

import requests

# GET request
response = requests.get("https://api.example.com/servers")
if response.status_code == 200:
    data = response.json()
else:
    print(f"Error: {response.status_code}")

# POST request: json= serializes the payload as the JSON request body
payload = {
    "name": "new-server",
    "region": "us-east-1",
    "type": "t3.medium"
}
response = requests.post(
    "https://api.example.com/servers",
    json=payload,
    headers={"Authorization": "Bearer TOKEN"}
)

# Query parameters: params= is encoded into the URL query string
response = requests.get(
    "https://api.example.com/logs",
    params={"service": "api", "level": "error", "limit": 100}
)

# Handle timeouts. requests applies no timeout unless one is passed, so a
# call without timeout= can wait indefinitely on an unresponsive server.
url = "https://api.example.com/servers"
try:
    response = requests.get(url, timeout=5)
except requests.Timeout:
    print("Request timed out")

Real-world: Health check with retries

def health_check_api(url, max_retries=3, timeout=5):
    """Check API health, retrying with exponential backoff.

    Sends GET requests to ``<url>/health`` until one returns HTTP 200 or the
    attempts are used up. Both request errors and non-200 responses count as
    failed attempts and are followed by a backoff of 1, 2, 4, ... seconds.

    Args:
        url: Base URL of the service (without the ``/health`` suffix).
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        True as soon as an attempt returns HTTP 200, otherwise False.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(
                f"{url}/health",
                timeout=timeout
            )
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        else:
            if response.status_code == 200:
                return True
            print(f"Attempt {attempt + 1} failed: HTTP {response.status_code}")

        # Back off before the next attempt only; sleeping after the final
        # attempt would just delay returning the failure.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # Exponential backoff

    return False

Data Parsing

JSON Operations

import json

# Parse JSON string into Python objects (here: a dict)
json_str = '{"host": "prod-01", "status": "running"}'
data = json.loads(json_str)
print(data["host"])

# Convert to JSON string; indent=2 pretty-prints for human readers
config = {"app": "myapp", "port": 8080, "debug": False}
json_output = json.dumps(config, indent=2)
print(json_output)

# Read JSON file (rebinds `config` to the file's contents)
with open("config.json", "r") as f:
    config = json.load(f)

# Write JSON file ("w" truncates any existing output.json)
with open("output.json", "w") as f:
    json.dump(config, f, indent=2)

YAML Operations

import yaml

# Parse YAML string
yaml_str = """
servers:
  - name: web-01
    ip: 10.0.0.1
    port: 8080
  - name: db-01
    ip: 10.0.0.2
    port: 5432
"""
# safe_load is the loader to use for input that is not fully trusted;
# plain yaml.load should not be used on such data.
data = yaml.safe_load(yaml_str)
for server in data["servers"]:
    print(f"{server['name']}: {server['ip']}")

# Read YAML file
with open("inventory.yaml", "r") as f:
    inventory = yaml.safe_load(f)

# Write YAML file; default_flow_style=False selects block style
# (one key per line) rather than inline {key: value} flow style.
config = {"environment": "prod", "replicas": 3}
with open("deploy.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

Log Parsing and Analysis

Parse and analyze logs:

import re
from collections import defaultdict

def parse_nginx_log(log_file):
    """Count HTTP status codes and request methods in an nginx access log.

    Lines that do not match the expected access-log layout are ignored.

    Args:
        log_file: Path to the access log.

    Returns:
        A ``defaultdict(int)`` with ``status_<code>`` and ``method_<verb>``
        keys mapped to their number of occurrences.
    """
    # Typical nginx log format:
    #   ip - user [timestamp] "method path protocol" status size
    line_re = re.compile(
        r'(\S+) - (\S+) \[(.*?)\] "(\S+) (\S+) (\S+)" (\d+) (\d+)'
    )
    counters = defaultdict(int)

    with open(log_file, "r") as handle:
        for entry in map(line_re.match, handle):
            if entry is None:
                continue
            # Group 4 is the request method, group 7 the response status.
            method, status = entry.group(4, 7)
            counters[f"status_{status}"] += 1
            counters[f"method_{method}"] += 1

    return counters

# Real-world: Find and report errors
def analyze_errors(log_file, pattern=r"ERROR"):
    """Collect the lines of a log file that match a regular expression.

    Args:
        log_file: Path to the log file.
        pattern: Regular expression searched for anywhere in each line.

    Returns:
        A list of ``{"line": <1-based line number>, "message": <stripped
        line text>}`` dicts, in file order.
    """
    with open(log_file, "r") as handle:
        return [
            {"line": number, "message": text.strip()}
            for number, text in enumerate(handle, start=1)
            if re.search(pattern, text)
        ]

# Example run: scan the application error log with the default "ERROR" pattern.
errors = analyze_errors("/var/log/app/error.log")
print(f"Found {len(errors)} errors")

System Monitoring

Monitor system resources:

import psutil

# CPU usage, sampled over a one-second interval (the call blocks that long)
cpu_percent = psutil.cpu_percent(interval=1)
print(f"CPU: {cpu_percent}%")

# Memory usage
memory = psutil.virtual_memory()
print(f"Memory: {memory.percent}% used")
print(f"Available: {memory.available / (1024**3):.2f} GB")

# Disk usage
disk = psutil.disk_usage("/")
print(f"Disk: {disk.percent}% used")

# Process info. When attrs are requested, process_iter pre-fetches them into
# proc.info and stores None for any attribute it was not allowed to read, so
# guard against None instead of catching exceptions that are not raised here.
for proc in psutil.process_iter(["pid", "name", "memory_percent"]):
    mem_percent = proc.info["memory_percent"]
    if mem_percent is not None and mem_percent > 10:
        print(f"{proc.info['name']}: {mem_percent}%")

Real-world: Monitoring script

def monitor_system(alert_thresholds):
    """Compare current CPU, memory and disk usage with alert limits.

    Args:
        alert_thresholds: Mapping with optional ``"cpu"``, ``"memory"`` and
            ``"disk"`` percentage limits. Missing keys fall back to 80, 85
            and 90 respectively.

    Returns:
        Dict of alert messages keyed by metric name, containing only the
        metrics whose usage is above the limit.
    """
    triggered = {}

    # CPU is sampled over one second, so this call blocks for that long.
    cpu_now = psutil.cpu_percent(interval=1)
    if cpu_now > alert_thresholds.get("cpu", 80):
        triggered["cpu"] = "High CPU usage"

    mem_stats = psutil.virtual_memory()
    if mem_stats.percent > alert_thresholds.get("memory", 85):
        triggered["memory"] = f"Memory: {mem_stats.percent}%"

    # Only the root filesystem is checked.
    root_fs = psutil.disk_usage("/")
    if root_fs.percent > alert_thresholds.get("disk", 90):
        triggered["disk"] = f"Disk: {root_fs.percent}%"

    return triggered

# Example run with stricter limits than the function's built-in defaults.
alerts = monitor_system({"cpu": 75, "memory": 80, "disk": 85})
# Only the messages are printed, so iterate over the values directly.
for message in alerts.values():
    print(f"ALERT: {message}")

Logging Best Practices

Configure logging for production scripts:

import logging
import logging.handlers

# Create logger (named loggers are shared: getLogger("deployment") returns
# the same object wherever it is called)
logger = logging.getLogger("deployment")

# File handler with rotation: once the file reaches maxBytes it is rolled
# over, and at most backupCount old files are kept.
handler = logging.handlers.RotatingFileHandler(
    "/var/log/app/deploy.log",
    maxBytes=10485760,  # 10MB
    backupCount=5
)

# Format logs: timestamp - logger name - level - message
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)

# Add handler; with the level at INFO, DEBUG messages are discarded
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Use logging
logger.info("Deployment started")
logger.warning("No backup found")
logger.error("Deployment failed")

Exercises

System Health Dashboard: Create a script that displays CPU, memory, and disk usage with alerts if thresholds are exceeded.
Log Analyzer: Parse application logs, count errors by type, and generate a report.
Multi-Server Deployment: Write a script using Paramiko to deploy a new application version across 10 servers with rollback on failure.
AWS Snapshot Manager: Create an automated backup script that creates daily snapshots of EC2 volumes and cleans up snapshots older than 30 days.
Configuration Validator: Write a script that validates JSON/YAML configuration files against a schema.
API Monitoring: Create a monitoring script that checks multiple API endpoints and logs response times, retrying with exponential backoff on failures.
Log Aggregator: Build a script that collects logs from multiple remote servers via SSH and stores them in a central location with parsing and analysis.

Summary

subprocess: Execute system commands
os/shutil: File and directory operations
paramiko: SSH connections and remote command execution
boto3: AWS service integration
requests: HTTP API interactions
json/yaml: Data parsing
logging: Structured logging for production code
psutil: System monitoring and metrics
regex: Log parsing and pattern matching

Running External Commands

subprocess Module

Operating System Operations

os Module

shutil Module

SSH and Remote Operations

Paramiko Library

AWS Integration with Boto3

HTTP Requests

requests Library

Data Parsing

JSON Operations

YAML Operations

Log Parsing and Analysis

System Monitoring

Logging Best Practices

Exercises

Summary