Data Overview

library(summarytools)
library(kableExtra)
library(actuar)
library(fplot)

st_options(plain.ascii = FALSE, style = "rmarkdown")

#library(lossdevtapp)
devtools::load_all()

data("claims_transactional")
data("losses")
data("exposures")

latest_eval <- losses |> dplyr::filter(eval_date == max(.data$eval_date))
wc_dat <- latest_eval |> dplyr::filter(coverage == "WC")
al_dat <- latest_eval |> dplyr::filter(coverage == "AL")

Overview

The purpose of this vignette is to

Simulation of Transactional Claims Data

In order to create an initial Proof-of-Concept Shiny Application, I first needed loss data to develop and work with, and I decided to approach this initially by simulating the data.

This package comes with a function: simulate_claims() which is a wrapper around the overall workflow I will walkthrough below.

When it comes to actuarial claim’s data, it is much easier to simulate on a claim level basis, simulating incremental changes to individual claims over time as opposed to aggregated simulations to derive static lossruns.

Parameters

  • Number of Claims
  • Accident Range (Start/End Date)
  • Seed
  • Payment Function (Density Function)
  • (Optional) Claim ID Prefix
  • (Optional) Possible Geographic Regions (i.e. States)
  • Report Date Binomial Distribution Parameters
  • Claim Status Binomial Distribution Parameters

For example, by default the simulate_claims() uses the following parameters:

  • Number of Claims: 1000
  • Start Date: 2015-01-01
  • End Date: The Current Date
  • Seed: 12345
  • Loss Distribution: LogNormal
  • Loss Distribution Parameters: list(mean_log = 7.5, sd_log = 1.5)
  • Probability of Status Open: 96%
  • Cache: FALSE
simulate_claims(
  n_claims = 1000,
  start_date = "2015-01-01",
  end_date = Sys.Date(),
  seed = 12345,
  loss_distribution = "lnorm",
  params = list(mean_log = 7.5, sd_log = 1.5),
  status_prob_open = 0.96,
  cache = FALSE,
  ...
)
simulate_claims <- function(n_claims = 1000,
                            start_date = "2015-01-01",
                            end_date = Sys.Date(),
                            seed = 12345,
                            loss_distribution = "lnorm",
                            params = list(mean_log = 7.5, sd_log = 1.5),
                            status_prob_open = 0.96,
                            cache = FALSE,
                            ...) {

  # loss_distribution <- match.arg("loss_distribution")

  # validate arguments
  stopifnot(
    is.numeric(n_claims) && n_claims > 0,
    class(as.Date(start_date)) == "Date",
    class(as.Date(end_date)) == "Date" &&
      as.Date(end_date) > as.Date(start_date),
    is.numeric(seed),
    loss_distribution %in% c(
      "lnorm",
      "lognormal",
      "normal",
      "gamma",
      "lgamma",
      "pareto",
      "weibull",
      "genbeta"
    ),
    is.numeric(status_prob_open),
    status_prob_open > 0 && status_prob_open < 1
  )

  beg_date <- as.Date(start_date)
  end_date <- as.Date(end_date)
  accident_range <- as.numeric(end_date - beg_date)
  set.seed(seed)
  accident_date <- sample(0:accident_range, size = n_claims, replace = TRUE)

  # mean_log <- 7.5
  # sd_log <- 1.5

  payment_fun <- function(n) stats::rlnorm(n, params$mean_log, params$sd_log)

  claims <- tibble::tibble(
    claim_num = paste0("claim-", 1:n_claims),
    accident_date = beg_date + lubridate::days(accident_date),
    state = sample(c("TX", "CA", "GA", "FL"), size = n_claims, replace = TRUE),
    claimant = randomNames::randomNames(n_claims),
    report_lag = stats::rnbinom(n_claims, 5, .25), # 0 if claim closed when reported
    status = stats::rbinom(n_claims, 1, 0.96), # initial payment amount
    payment =  payment_fun(n_claims)
  ) %>%
    dplyr::mutate(
      report_date = .data$accident_date + .data$report_lag,
      payment = ifelse(.data$status == 0, 0, .data$payment),
      case = .data$payment + stats::runif(.env$n_claims, 0.25, 8.0),
      transaction_date = .data$report_date
    ) %>%
    dplyr::arrange(.data$accident_date)

  n_trans <- stats::rnbinom(n_claims, 3, 0.25)
  trans_lag <- lapply(n_trans, function(x) stats::rnbinom(x, 7, 0.1)) %>%
    lapply(function(x) { if (length(x) == 0) 0 else x })

  for (i in seq_len(n_claims)) {
    trans_lag[[i]] <- tibble::tibble(
      "trans_lag" = trans_lag[[i]],
      "claim_num" = paste0("claim-", i)
    )
  }

  trans_tbl <- dplyr::bind_rows(trans_lag) %>%
    dplyr::group_by(.data$claim_num) %>%
    dplyr::mutate(trans_lag = cumsum(.data$trans_lag)) %>%
    dplyr::ungroup()

  # separate all zero claims from the claims that have payments
  zero_claims <- dplyr::filter(claims, .data$status == 0)
  first_trans <- dplyr::filter(claims, .data$status == 1)

  subsequent_trans <- dplyr::left_join(trans_tbl, first_trans, by = "claim_num") %>%
    dplyr::filter(!is.na(.data$accident_date))

  n_trans <- nrow(subsequent_trans)

  subsequent_trans <- subsequent_trans %>%
    dplyr::mutate(payment = payment_fun(.env$n_trans),
                  case = pmax(.data$case * stats::rnorm(.env$n_trans, 1.5, 0.1) - .data$payment, 500),
                  transaction_date = .data$report_date + .data$trans_lag) %>%
    dplyr::select(-.data$trans_lag)

  trans <- dplyr::bind_rows(zero_claims, first_trans, subsequent_trans) %>%
    dplyr::arrange(.data$transaction_date)

  # add in a transaction number
  trans$trans_num <- 1:nrow(trans)

  # set final trans status to closed and case to 0
  trans <- trans %>%
    dplyr::arrange(.data$trans_num) %>%
    dplyr::group_by(.data$claim_num) %>%
    dplyr::mutate(final_trans = ifelse(.data$trans_num == max(.data$trans_num), TRUE, FALSE),
                  status = ifelse(.data$final_trans, 0, 1),
                  case = ifelse(.data$final_trans, 0, .data$case),
                  status = ifelse(.data$status == 0, "Closed", "Open"),
                  paid = round(cumsum(.data$payment), 0),
                  case = round(.data$case, 0),
                  payment = round(.data$payment, 0)) %>%
    dplyr::select(-.data$final_trans) %>%
    dplyr::arrange(.data$accident_date) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(.data$claim_num, dplyr::desc(.data$transaction_date))

  if (cache) { saveRDS(trans, file = "trans.RDS") }

  trans

}
# number of claims
n_claims <- 10000

# start/end dates & accident range
beg_date <- as.Date("2015-01-01")
end_date <- end_of_month(start_of_month(Sys.Date()) - 1)
accident_range <- as.numeric(end_date - beg_date)

set.seed(1234)

accident_date <- sample(0:accident_range, size = n_claims, replace = TRUE)

payment_fun <- function(n) rlnorm(n, 7.5, 1.5)

claims <- tibble::tibble(
  claim_num = 1:n_claims,
  claim_id = paste0("claim-", 1:n_claims),
  accident_date = beg_date + lubridate::days(accident_date),
  state = sample(c("TX", "CA", "GA", "FL"), size = n_claims, replace = TRUE),
  claimant = randomNames::randomNames(n_claims),
  report_date = rnbinom(n_claims, 5, .25),
  # 0 if claim closed when reported
  status = rbinom(n_claims, 1, 0.96),
  # initial payment amount
  payment =  payment_fun(n_claims)) %>%
  dplyr::mutate(report_date = accident_date + report_date,
                # set payment to zero if closed when reported
                payment = ifelse(status == 0, 0, payment),
                case = payment * runif(n_claims, 0.25, 8.0),
                transaction_date = report_date) %>%
  dplyr::arrange(accident_date)

To start, I specify the Experience Period in which I want to simulate claims using a start and end date:

# start/end dates & accident range
# beg_date <- as.Date("2015-01-01")
# end_date <- end_of_month(start_of_month(Sys.Date()) - 1)
# accident_range <- as.numeric(end_date - beg_date)