--- title: "Data Overview" author: "Jimmy Briggs" description: > Provide an overview of the data utilized by this package and how it was derived. vignette: > %\VignetteIndexEntry{Data Overview} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", results = "asis", message = FALSE, warning = FALSE, eval = FALSE ) options(tibble.print_min = 4, tibble.print_max = 4) options(scipen = 999) ``` ```{r setup, message=FALSE, warning=FALSE} library(summarytools) library(kableExtra) library(actuar) library(fplot) st_options(plain.ascii = FALSE, style = "rmarkdown") #library(lossdevtapp) devtools::load_all() data("claims_transactional") data("losses") data("exposures") latest_eval <- losses |> dplyr::filter(eval_date == max(.data$eval_date)) wc_dat <- latest_eval |> dplyr::filter(coverage == "WC") al_dat <- latest_eval |> dplyr::filter(coverage == "AL") ``` ## Overview The purpose of this vignette is to ## Simulation of Transactional Claims Data In order to create an initial Proof-of-Concept Shiny Application, I first needed loss data to develop and work with, and I decided to approach this initially by simulating the data. This package comes with a function: [`simulate_claims()`](../R/fct_simulate_claims.R) which is a wrapper around the overall workflow I will walkthrough below. When it comes to actuarial claim's data, it is much easier to simulate on a claim level basis, simulating incremental changes to individual claims over time as opposed to aggregated simulations to derive static lossruns. ### Parameters - Number of Claims - Accident Range (Start/End Date) - Seed - Payment Function (Density Function) - (Optional) Claim ID Prefix - (Optional) Possible Geographic Regions (i.e. States) - Report Date Binomial Distribution Parameters - Claim Status Binomial Distribution Parameters For example, by default the [`simulate_claims()`](../R/fct_simulate_claims.R) uses the following parameters: - Number of Claims: **1000** - Start Date: **2015-01-01** - End Date: The Current Date - Seed: **12345** - Loss Distribution: **LogNormal** - Loss Distribution Parameters: **`list(mean_log = 7.5, sd_log = 1.5)`** - Probability of Status Open: **96%** - Cache: `FALSE` ```R simulate_claims( n_claims = 1000, start_date = "2015-01-01", end_date = Sys.Date(), seed = 12345, loss_distribution = "lnorm", params = list(mean_log = 7.5, sd_log = 1.5), status_prob_open = 0.96, cache = FALSE, ... ) ``` ```R simulate_claims <- function(n_claims = 1000, start_date = "2015-01-01", end_date = Sys.Date(), seed = 12345, loss_distribution = "lnorm", params = list(mean_log = 7.5, sd_log = 1.5), status_prob_open = 0.96, cache = FALSE, ...) { # loss_distribution <- match.arg("loss_distribution") # validate arguments stopifnot( is.numeric(n_claims) && n_claims > 0, class(as.Date(start_date)) == "Date", class(as.Date(end_date)) == "Date" && as.Date(end_date) > as.Date(start_date), is.numeric(seed), loss_distribution %in% c( "lnorm", "lognormal", "normal", "gamma", "lgamma", "pareto", "weibull", "genbeta" ), is.numeric(status_prob_open), status_prob_open > 0 && status_prob_open < 1 ) beg_date <- as.Date(start_date) end_date <- as.Date(end_date) accident_range <- as.numeric(end_date - beg_date) set.seed(seed) accident_date <- sample(0:accident_range, size = n_claims, replace = TRUE) # mean_log <- 7.5 # sd_log <- 1.5 payment_fun <- function(n) stats::rlnorm(n, params$mean_log, params$sd_log) claims <- tibble::tibble( claim_num = paste0("claim-", 1:n_claims), accident_date = beg_date + lubridate::days(accident_date), state = sample(c("TX", "CA", "GA", "FL"), size = n_claims, replace = TRUE), claimant = randomNames::randomNames(n_claims), report_lag = stats::rnbinom(n_claims, 5, .25), # 0 if claim closed when reported status = stats::rbinom(n_claims, 1, 0.96), # initial payment amount payment = payment_fun(n_claims) ) %>% dplyr::mutate( report_date = .data$accident_date + .data$report_lag, payment = ifelse(.data$status == 0, 0, .data$payment), case = .data$payment + stats::runif(.env$n_claims, 0.25, 8.0), transaction_date = .data$report_date ) %>% dplyr::arrange(.data$accident_date) n_trans <- stats::rnbinom(n_claims, 3, 0.25) trans_lag <- lapply(n_trans, function(x) stats::rnbinom(x, 7, 0.1)) %>% lapply(function(x) { if (length(x) == 0) 0 else x }) for (i in seq_len(n_claims)) { trans_lag[[i]] <- tibble::tibble( "trans_lag" = trans_lag[[i]], "claim_num" = paste0("claim-", i) ) } trans_tbl <- dplyr::bind_rows(trans_lag) %>% dplyr::group_by(.data$claim_num) %>% dplyr::mutate(trans_lag = cumsum(.data$trans_lag)) %>% dplyr::ungroup() # separate all zero claims from the claims that have payments zero_claims <- dplyr::filter(claims, .data$status == 0) first_trans <- dplyr::filter(claims, .data$status == 1) subsequent_trans <- dplyr::left_join(trans_tbl, first_trans, by = "claim_num") %>% dplyr::filter(!is.na(.data$accident_date)) n_trans <- nrow(subsequent_trans) subsequent_trans <- subsequent_trans %>% dplyr::mutate(payment = payment_fun(.env$n_trans), case = pmax(.data$case * stats::rnorm(.env$n_trans, 1.5, 0.1) - .data$payment, 500), transaction_date = .data$report_date + .data$trans_lag) %>% dplyr::select(-.data$trans_lag) trans <- dplyr::bind_rows(zero_claims, first_trans, subsequent_trans) %>% dplyr::arrange(.data$transaction_date) # add in a transaction number trans$trans_num <- 1:nrow(trans) # set final trans status to closed and case to 0 trans <- trans %>% dplyr::arrange(.data$trans_num) %>% dplyr::group_by(.data$claim_num) %>% dplyr::mutate(final_trans = ifelse(.data$trans_num == max(.data$trans_num), TRUE, FALSE), status = ifelse(.data$final_trans, 0, 1), case = ifelse(.data$final_trans, 0, .data$case), status = ifelse(.data$status == 0, "Closed", "Open"), paid = round(cumsum(.data$payment), 0), case = round(.data$case, 0), payment = round(.data$payment, 0)) %>% dplyr::select(-.data$final_trans) %>% dplyr::arrange(.data$accident_date) %>% dplyr::ungroup() %>% dplyr::arrange(.data$claim_num, dplyr::desc(.data$transaction_date)) if (cache) { saveRDS(trans, file = "trans.RDS") } trans } ``` ```R # number of claims n_claims <- 10000 # start/end dates & accident range beg_date <- as.Date("2015-01-01") end_date <- end_of_month(start_of_month(Sys.Date()) - 1) accident_range <- as.numeric(end_date - beg_date) set.seed(1234) accident_date <- sample(0:accident_range, size = n_claims, replace = TRUE) payment_fun <- function(n) rlnorm(n, 7.5, 1.5) claims <- tibble::tibble( claim_num = 1:n_claims, claim_id = paste0("claim-", 1:n_claims), accident_date = beg_date + lubridate::days(accident_date), state = sample(c("TX", "CA", "GA", "FL"), size = n_claims, replace = TRUE), claimant = randomNames::randomNames(n_claims), report_date = rnbinom(n_claims, 5, .25), # 0 if claim closed when reported status = rbinom(n_claims, 1, 0.96), # initial payment amount payment = payment_fun(n_claims)) %>% dplyr::mutate(report_date = accident_date + report_date, # set payment to zero if closed when reported payment = ifelse(status == 0, 0, payment), case = payment * runif(n_claims, 0.25, 8.0), transaction_date = report_date) %>% dplyr::arrange(accident_date) ``` To start, I specify the *Experience Period* in which I want to simulate claims using a start and end date: ```{r sim_input_dates} # start/end dates & accident range # beg_date <- as.Date("2015-01-01") # end_date <- end_of_month(start_of_month(Sys.Date()) - 1) # accident_range <- as.numeric(end_date - beg_date) ```