Generate synthetic data for Accounts Receivable

For those who would like to generate dummy/synthetic data for Accounts Receivable, use Python script below.

It can be run in Google Collab notebook if you don’t have local Python environment setup:

  1. Edit variables - date period, # of customers and transactions, reporting date (default is today).

  2. Run the code

  3. Navigate to Files and download the 3 CSV files generated.

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set random seed for reproducibility
np.random.seed(42)

# Configuration
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 3, 31)
num_customers = 75
num_transactions = 10000

# Create customer segments
customer_segments = ['Enterprise', 'Mid-Market', 'Small Business', 'Government']
customer_credit_terms = [30, 45, 60, 90, 180]

# Create customers
customers = []
for i in range(1, num_customers + 1):
    segment = random.choice(customer_segments)
    credit_terms = random.choice(customer_credit_terms)
    customers.append({
        'CustomerID': f'CUST{i:04d}',
        'CustomerName': f'Customer {i}',
        'Segment': segment,
        'CreditTerms': credit_terms
    })

customers_df = pd.DataFrame(customers)

# Create invoices with seasonal patterns
invoices = []
transaction_id = 1

for _ in range(num_transactions):
    # Select a random customer
    customer = random.choice(customers)
    
    # Create random invoice date
    days_range = (end_date - start_date).days
    random_days = random.randint(0, days_range)
    invoice_date = start_date + timedelta(days=random_days)
    
    # Add seasonality to invoice amounts
    month = invoice_date.month
    
    # Higher amounts in Q4 (Oct-Dec)
    if month in [10, 11, 12]:
        base_amount = np.random.normal(10000, 3000)
    # Lower amounts in Q1 (Jan-Mar)
    elif month in [1, 2, 3]:
        base_amount = np.random.normal(5000, 1500)
    # Medium amounts rest of year
    else:
        base_amount = np.random.normal(7500, 2000)
    
    # Adjust amount based on customer segment
    if customer['Segment'] == 'Enterprise':
        amount = base_amount * 2.5
    elif customer['Segment'] == 'Mid-Market':
        amount = base_amount * 1.5
    elif customer['Segment'] == 'Government':
        amount = base_amount * 1.8
    else:  # Small Business
        amount = base_amount * 0.7
    
    # Round to 2 decimal places
    invoice_amount = round(max(amount, 100), 2)  # Minimum $100
    
    # Calculate due date based on credit terms
    due_date = invoice_date + timedelta(days=customer['CreditTerms'])
    
    # Determine if paid and payment date with seasonal collection patterns
    is_paid = random.random() < 0.85  # 85% of invoices eventually get paid
    
    if is_paid:
        # Create seasonal payment behavior
        # December has faster payments (bonus incentives)
        if due_date.month == 12:
            days_to_pay = max(int(np.random.normal(-5, 10)), -15)  # More likely to pay early
        # January has slower payments
        elif due_date.month == 1:
            days_to_pay = max(int(np.random.normal(15, 12)), -5)  # More likely to pay late
        # Quarter-end months have slightly faster payments
        elif due_date.month in [3, 6, 9]:
            days_to_pay = int(np.random.normal(0, 15))
        # Normal months
        else:
            days_to_pay = int(np.random.normal(5, 20))
        
        # Adjust by customer segment
        if customer['Segment'] == 'Enterprise':
            days_to_pay += int(np.random.normal(0, 5))
        elif customer['Segment'] == 'Small Business':
            days_to_pay += int(np.random.normal(10, 8))
        elif customer['Segment'] == 'Government':
            days_to_pay += int(np.random.normal(15, 10))
        
        payment_date = due_date + timedelta(days=days_to_pay)
        
        # Ensure payment date is not before invoice date
        if payment_date < invoice_date:
            payment_date = invoice_date + timedelta(days=random.randint(0, 5))
        
        # Ensure payment date is not in the future
        if payment_date > datetime.now():
            days_to_pay = None
            payment_date = None
            is_paid = False
    else:
        days_to_pay = None
        payment_date = None
    
    invoices.append({
        'TransactionID': f'INV{transaction_id:06d}',
        'CustomerID': customer['CustomerID'],
        'InvoiceDate': invoice_date,
        'DueDate': due_date,
        'InvoiceAmount': invoice_amount,
        'IsPaid': is_paid,
        'PaymentDate': payment_date,
        'DaysToPayment': days_to_pay
    })
    
    transaction_id += 1

# Create DataFrame
invoices_df = pd.DataFrame(invoices)

# Calculate aging buckets using current date for reporting
reporting_date = datetime.now()  # Use current date for aging snapshot

def calculate_aging(row):
    if row['IsPaid']:
        if row['PaymentDate'] <= reporting_date:
            return 'Paid'
    
    days_outstanding = (reporting_date - row['DueDate']).days
    
    if days_outstanding <= 0:
        return 'Current'
    elif days_outstanding <= 30:
        return '1-30 Days'
    elif days_outstanding <= 60:
        return '31-60 Days'
    elif days_outstanding <= 90:
        return '61-90 Days'
    else:
        return '90+ Days'

invoices_df['AgingBucket'] = invoices_df.apply(calculate_aging, axis=1)

# Create monthly sales summary for forecasting
invoices_df['YearMonth'] = invoices_df['InvoiceDate'].dt.strftime('%Y-%m')
monthly_sales = invoices_df.groupby('YearMonth')['InvoiceAmount'].sum().reset_index()
monthly_sales = monthly_sales.sort_values('YearMonth')

# Save to CSV
invoices_df.to_csv('synthetic_ar_transactions.csv', index=False)
customers_df.to_csv('synthetic_customers.csv', index=False)
monthly_sales.to_csv('synthetic_monthly_sales.csv', index=False)

print("Generated synthetic AR dataset with:")
print(f"- {len(customers_df)} customers")
print(f"- {len(invoices_df)} invoice transactions")
print(f"- Data spanning from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
print("Files saved: synthetic_ar_transactions.csv, synthetic_customers.csv, synthetic_monthly_sales.csv")
Next
Next

Building Superior Heatmaps in Power BI with SVG 🔥