################################################################
# Carl 10 Dec 2016
#
# Monte Carlo study of TOPALS and indirect standardization estimators
# on simulated mortality data for which the true log(mx) schedule is known
#
# - Uses the male log(mx) schedule for Brazil as the "truth"
#
# - Generates simulated small-area samples for different population sizes n,
#    where n=# of people in each single-year age group. (For ex., if n=10 then
#    there are (10 infants, 10 1-yr-olds, ..., 10 99 yr-olds exposed to the
#    true mortality rates, and the observed numbers fo deaths are Poisson
#    random variables Dx ~ 10*(true mx)
#
# - Fits TOPALS and IS models for each sample using
#       (1) the HMD male average logmx values as the standard
#       (2) the true schedule as the standard
#
# - Calculates errors abs(estimated logmx - true logmx) for each (sample, method, std, age)
# - Calculates errors abs(estimated e0 - true e0) for each (sample, method, std)
#
# - Summarizes errors by (sample size, method)
################################################################
# HOUSEKEEPING
graphics.off()
if (.Platform$OS.type == 'windows') windows(width=12, height=8, record=TRUE)

rm(list=ls())   

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(splines)

set.seed(6447100)
# GET BRAZILIAN DATA
### Observed deaths and populations for UF, Meso, Micro and Municipalities in Brazil in 2010 
###   (deaths are totals over 2009-2011)
big.df  = read.csv("../big2010.df.csv",skip=10,  header=T, sep=",", dec=".", stringsAsFactors = FALSE)

### municipio-level geographic data (names and codes for state, meso, micro, etc.)
muni.df = read.csv("../muni.df.csv", header=T, sep=",", dec=".", stringsAsFactors = FALSE)

### build linear B-spline matrix (100 x 7)
age = 0:99
B   = bs( age, knots=c(0,1,10,20,40,70), degree=1)

nalpha = ncol(B)

### calculate a national-level male logmx schedule for Brazil
BR <- left_join(big.df, muni.df, by='munid') %>%
  filter(sex=='m') %>%
  select(munid, municode, muniname, ufabb, sex,age,death,expos) %>%
  group_by(sex, age) %>%
  summarise(expos=sum(expos), death=sum(death)) %>%
  mutate(mx=death/expos, logmx=log(mx))

head(BR)
## Source: local data frame [6 x 6]
## Groups: sex [1]
## 
##     sex   age   expos death           mx     logmx
##   <chr> <int>   <dbl> <int>        <dbl>     <dbl>
## 1     m     0 4133216 67803 0.0164044189 -4.110205
## 2     m     1 4142143  4921 0.0011880323 -6.735457
## 3     m     2 4189374  2809 0.0006705060 -7.307478
## 4     m     3 4280676  2040 0.0004765602 -7.648917
## 5     m     4 4376254  1726 0.0003944013 -7.838142
## 6     m     5 4432177  1554 0.0003506178 -7.955814
plot(0:99, BR$logmx, xlab='Age', ylab='log(mx)',pch=15, cex=.75,
     main='True Schedule for Simulations\nLog Mortality, Brazil Males 2009-2011')

### standard rates by sex from HMD dataframe (calculated elsewhere)
HMDstd = dget('../HMDstd.dput') %>%
  filter(age %in% 0:99)
# SIMULATED DATA: deaths by age (100), sample# (5000), sample size (3)
sample.sizes = c(1,100,10000)
nsamp        = 5000

D = array(NA, c(100, nsamp, length(sample.sizes)), 
                dimnames=list(0:99,paste0('samp#',seq(nsamp)),paste(sample.sizes)))

for (n in sample.sizes) {
  D[,,paste(n)]     = sapply(1:nsamp, function(x) rpois(100, n*exp(BR$logmx)) )
}

## display a bit of the simulated death data (first 5 samples at each sample size)
D[,1:5,]
## , , 1
## 
##    samp#1 samp#2 samp#3 samp#4 samp#5
## 0       0      0      0      1      0
## 1       0      0      0      0      0
## 2       0      0      0      0      0
## 3       0      0      0      0      0
## 4       0      0      0      0      0
## 5       0      0      0      0      0
## 6       0      0      0      0      0
## 7       0      0      0      0      0
## 8       0      0      0      0      0
## 9       0      0      0      0      0
## 10      0      0      0      0      0
## 11      0      0      0      0      0
## 12      0      0      0      0      0
## 13      0      0      0      0      0
## 14      0      0      0      0      0
## 15      0      0      0      0      0
## 16      0      0      0      0      0
## 17      0      0      0      0      0
## 18      0      0      0      0      0
## 19      0      0      0      0      0
## 20      0      0      0      0      0
## 21      0      0      0      0      0
## 22      0      0      0      0      0
## 23      0      0      0      0      0
## 24      0      0      0      0      0
## 25      0      0      0      0      0
## 26      0      0      0      0      0
## 27      0      0      0      0      0
## 28      0      0      0      0      0
## 29      0      0      0      0      0
## 30      0      0      0      0      0
## 31      0      0      0      0      0
## 32      0      0      0      0      0
## 33      0      0      0      0      0
## 34      0      0      0      0      0
## 35      0      0      0      0      0
## 36      0      0      0      0      0
## 37      0      0      0      0      0
## 38      0      0      0      0      0
## 39      0      0      0      0      0
## 40      0      0      0      0      0
## 41      0      0      0      0      0
## 42      0      0      0      0      0
## 43      0      0      0      0      0
## 44      0      0      0      0      0
## 45      0      1      0      0      0
## 46      0      0      0      0      0
## 47      0      0      0      0      0
## 48      0      0      0      0      0
## 49      0      0      0      0      0
## 50      0      0      0      0      0
## 51      0      0      0      0      0
## 52      0      0      0      0      0
## 53      0      0      0      0      0
## 54      0      0      0      0      0
## 55      0      0      0      0      0
## 56      0      0      0      0      0
## 57      0      0      0      0      0
## 58      0      0      0      0      0
## 59      0      0      0      0      0
## 60      0      0      0      0      0
## 61      0      0      0      0      0
## 62      0      0      0      0      0
## 63      0      0      0      0      0
## 64      0      0      0      0      0
## 65      0      0      0      0      0
## 66      0      0      0      0      0
## 67      0      0      0      0      1
## 68      0      0      0      0      0
## 69      0      0      0      0      0
## 70      0      0      0      0      0
## 71      1      0      0      0      0
## 72      0      0      0      0      0
## 73      0      0      0      0      0
## 74      1      0      0      0      0
## 75      0      0      0      0      0
## 76      0      1      0      0      0
## 77      0      0      0      1      0
## 78      0      0      0      0      1
## 79      0      1      0      0      1
## 80      0      0      0      0      0
## 81      0      0      0      0      0
## 82      0      0      0      0      1
## 83      1      0      0      0      0
## 84      0      0      0      1      2
## 85      1      0      0      0      1
## 86      0      0      0      1      0
## 87      0      1      0      1      0
## 88      0      1      0      1      0
## 89      0      1      0      0      0
## 90      0      0      0      0      0
## 91      0      1      0      0      0
## 92      0      1      0      1      0
## 93      1      1      0      0      0
## 94      0      0      0      0      0
## 95      0      1      0      0      0
## 96      1      0      0      0      0
## 97      0      1      2      0      0
## 98      1      0      0      0      1
## 99      0      0      0      1      0
## 
## , , 100
## 
##    samp#1 samp#2 samp#3 samp#4 samp#5
## 0       3      2      0      1      5
## 1       0      0      0      0      0
## 2       0      0      0      0      0
## 3       0      0      0      0      0
## 4       0      0      0      0      0
## 5       0      0      0      0      0
## 6       0      0      0      0      0
## 7       0      0      0      0      0
## 8       0      0      0      0      0
## 9       0      0      0      0      0
## 10      0      0      0      0      0
## 11      0      0      0      0      0
## 12      0      0      0      0      0
## 13      0      0      0      0      0
## 14      0      0      0      0      0
## 15      0      0      0      0      0
## 16      0      0      0      0      0
## 17      0      1      0      0      0
## 18      0      1      0      0      0
## 19      0      0      0      0      0
## 20      0      1      0      0      0
## 21      0      0      0      0      0
## 22      0      0      0      0      0
## 23      0      0      2      0      1
## 24      0      1      0      0      0
## 25      0      1      0      0      0
## 26      0      0      1      0      1
## 27      0      0      0      1      0
## 28      1      0      0      0      0
## 29      0      0      0      1      0
## 30      0      0      0      0      2
## 31      0      0      0      0      0
## 32      0      0      0      0      0
## 33      0      0      0      0      0
## 34      0      2      0      0      0
## 35      0      0      1      0      0
## 36      2      0      0      1      0
## 37      0      0      0      1      0
## 38      1      1      0      0      0
## 39      1      1      1      0      0
## 40      0      2      1      0      0
## 41      0      2      0      2      0
## 42      0      0      0      1      0
## 43      2      0      0      0      0
## 44      1      0      2      2      0
## 45      0      0      1      0      0
## 46      0      0      0      0      0
## 47      1      1      0      1      0
## 48      0      2      1      0      1
## 49      3      1      1      0      2
## 50      1      0      2      0      0
## 51      1      1      1      2      0
## 52      0      0      2      1      0
## 53      1      0      1      1      1
## 54      3      4      1      1      2
## 55      0      2      2      0      2
## 56      2      1      1      2      4
## 57      2      0      2      5      1
## 58      1      0      2      4      1
## 59      4      2      2      1      1
## 60      2      1      4      1      0
## 61      2      1      0      4      3
## 62      3      0      2      1      0
## 63      2      2      2      1      1
## 64      1      2      3      3      3
## 65      2      0      1      1      4
## 66      2      2      5      2      4
## 67      3      3      1      1      2
## 68      3      3      2      4      2
## 69      2      2      2      5      2
## 70      4      3      4      2      1
## 71      4      4      6      4      6
## 72      5      5      2     10      3
## 73      1      7      4      7      3
## 74      6      5      4      8     10
## 75      6      7      7      2      7
## 76      6      5      4      6      6
## 77      6      6      8      4      6
## 78      4      5      7      5      4
## 79      5      8      5      6      4
## 80      8      5      8      7      6
## 81      8      4      3      9      7
## 82     12     12      6     10      3
## 83      7      7     12     11      6
## 84      5     12      7     12     11
## 85      7     15     15      5     15
## 86     22      9      4      7     14
## 87     17     10     11      9     16
## 88     13      9     15     19     12
## 89     14     14     12     14     16
## 90     16     15     13     18     16
## 91     15     14     18     16     14
## 92     22     20     19     16     18
## 93     20     18     30     23     22
## 94     26     16     13     13     26
## 95     17     17     27     16     26
## 96     18     28     22     25     23
## 97     27     17     18     14     22
## 98     31     33     18     31     22
## 99     31     22     27     19     23
## 
## , , 10000
## 
##    samp#1 samp#2 samp#3 samp#4 samp#5
## 0     158    170    151    156    172
## 1      14      4     12     14     10
## 2      10      8     10      7      8
## 3       2      5      5      3      3
## 4       6      4      3      5      8
## 5       3      5      4      3      5
## 6       2      2      0      3      4
## 7       2      2      3      3      0
## 8       1      3      4      1      4
## 9       3      5      2      1      2
## 10      3      3      1      3      3
## 11      2      5      2      2      3
## 12      5      5      4      5      4
## 13      9      5      6      8      5
## 14      5      9      6     12      9
## 15     12      5      9     12      8
## 16     17     17     14     15      8
## 17     15     20     29     21     19
## 18     24     30     24     22     19
## 19     24     20     31     14     17
## 20     24     27     26     24     25
## 21     17     35     34     19     30
## 22     37     22     28     30     17
## 23     24     21     29     16     32
## 24     20     24     28     35     23
## 25     24     20     30     33     22
## 26     20     20     21     26     24
## 27     21     24     23     28     23
## 28     26     31     23     30     17
## 29     31     29     30     27     25
## 30     29     30     30     35     18
## 31     29     27     29     30     20
## 32     39     27     23     26     38
## 33     34     27     24     19     32
## 34     34     29     32     31     22
## 35     38     36     30     28     28
## 36     31     44     32     30     34
## 37     36     48     31     37     28
## 38     28     31     42     25     34
## 39     28     21     37     30     34
## 40     35     45     30     32     38
## 41     35     42     57     43     27
## 42     54     48     52     42     38
## 43     51     59     39     58     33
## 44     50     45     39     54     41
## 45     58     42     66     49     63
## 46     51     62     40     60     65
## 47     44     57     64     77     70
## 48     67     39     58     64     75
## 49     65     61     57     73     65
## 50     73     57     71     84     77
## 51     84     83     71     72     74
## 52     73     82     85     91     82
## 53    101    112    102    109    103
## 54     87     96     99    100    103
## 55     95    120     96    115    109
## 56    130    105    113    128    100
## 57    116    104    129    105    150
## 58    130    133    131    137    145
## 59    152    153    128    148    131
## 60    144    172    141    146    133
## 61    184    163    168    171    161
## 62    186    172    182    169    176
## 63    172    173    198    204    192
## 64    188    189    195    183    173
## 65    220    203    221    186    214
## 66    252    212    214    237    258
## 67    274    250    254    260    243
## 68    272    283    236    324    308
## 69    276    311    278    295    296
## 70    339    356    308    280    327
## 71    384    338    388    419    321
## 72    386    389    387    395    362
## 73    424    395    405    378    416
## 74    437    466    445    411    421
## 75    467    433    515    511    471
## 76    530    573    503    510    499
## 77    553    565    572    597    554
## 78    634    584    665    634    655
## 79    668    700    683    729    651
## 80    728    779    747    739    730
## 81    799    794    820    787    828
## 82    883    833    926    919    921
## 83    913    918    954    975    935
## 84    976   1034   1016    967   1054
## 85   1084   1101   1074   1125   1150
## 86   1139   1235   1257   1214   1212
## 87   1335   1360   1230   1285   1322
## 88   1404   1414   1377   1362   1407
## 89   1412   1437   1441   1470   1422
## 90   1553   1505   1633   1505   1534
## 91   1773   1690   1663   1657   1651
## 92   1915   1900   1829   1920   1850
## 93   1948   2029   2051   2027   2016
## 94   2226   2187   2179   2140   2135
## 95   2149   2094   2188   2232   2171
## 96   2300   2273   2372   2369   2274
## 97   2228   2307   2315   2349   2331
## 98   2417   2408   2460   2327   2416
## 99   2558   2428   2584   2553   2525
# IS and TOPALS fits for logmx
logmx.hat = array(NA, c(100,3,nsamp,2,2),
                  dimnames=list(0:99,paste(sample.sizes),paste0('samp#',1:nsamp), c('TOPALS','IS'),c('BR','HMD')))
# calculate IS estimates of logmx
# for a column of observed deaths D0...D99, sample size n, and standard schedule
IS.fit = function(Dx, n, std) {
  expected.deaths = sum(n * exp(std))
  offset          = log( sum(Dx) / expected.deaths)
  return( std + offset)
}


for (n in sample.sizes) {
  logmx.hat[,paste(n),,'IS','BR']  = apply(D[,,paste(n)],2, IS.fit, n=n, std=BR$logmx)
  logmx.hat[,paste(n),,'IS','HMD'] = apply(D[,,paste(n)],2, IS.fit, n=n, std=HMDstd[,'male'])
}
# calculate TOPALS estimates of logmx
# for a column of observed deaths D0...D99, sample size n, and standard schedule

## log likelihood function L(alpha | D,N)
penalized.logL = function( alpha, D, N, std, Basis=B, k=1) {
  logmx.hat = std + (Basis %*% alpha)
  penalty = k * sum(diff(alpha)^2)
  return( sum( -N * exp(logmx.hat) + D * logmx.hat) - penalty)
}

TOPALS.fit = function(Dx, n, std) {
  # control parameters and initial values for nonlinear fit
  this.control = list(maxit=1000,fnscale= -1, parscale=rep(.01,ncol(B)))
  alpha0 = rep(0, ncol(B))
  
  fit = optim( alpha0, penalized.logL, D=Dx, N=rep(n,100), std=std, Basis=B,
                          method='BFGS', control=this.control)
       
  converged = (fit$convergence == 0)
  if (converged)  { result = std + B %*% fit$par }
  if (!converged) { result = std + NA} 
  return( result)
}

for (n in sample.sizes) {
  logmx.hat[,paste(n),,'TOPALS','BR']  = apply(D[,,paste(n)],2, TOPALS.fit, n=n, std=BR$logmx)
  logmx.hat[,paste(n),,'TOPALS','HMD'] = apply(D[,,paste(n)],2, TOPALS.fit, n=n, std=HMDstd[,'male'])
}

## change any bad data (caused by samples with a TOTAL of zero deaths) to missing
logmx.hat[ !is.finite(logmx.hat)] = NA

##########################################################################

## display some randomly selected samples and fits (n=100)
op = par(no.readonly = TRUE) # save graphical settings

par(mfrow=c(1,2))

  sel = sort( sample(nsamp, 15) )
  for (i in sel) {
  
    for (method in c('IS','TOPALS')) {
      plot(0:99, BR$logmx, type='p', xlab='Age', ylab='log(mx)', pch=15,cex=.75,
           main=paste('Sample',i,'with n=100 people per single-year age\n',
                       method,'(green solid=correct std, red dashed=wrong std)'),
           cex.main = .80, ylim=c(-10,0) )
      points(0:99, log( D[,i,'100'] / 100) )
      lines(0:99, logmx.hat[,'100',i,method,'BR'], col='forestgreen', lwd=3)
      lines(0:99, logmx.hat[,'100',i,method,'HMD'], col='red', lwd=3,lty=2)
      
      zero.ages = (0:99)[ which(D[,i,'100'] == 0) ]
      rug( zero.ages)
    } # for method

  } # for i

par(op)

##########################################################################
# ERROR SUMMARIES
##########################################################################

## calculate and print the mean abs(logmx.hat - logmx) for each
## (sample size, method, std)

logmx.errors= apply( abs( logmx.hat - BR$logmx), c(2,4,5), mean, na.rm=TRUE)
round(logmx.errors, 3)
## , , BR
## 
##       TOPALS    IS
## 1      0.417 0.415
## 100    0.142 0.040
## 10000  0.028 0.004
## 
## , , HMD
## 
##       TOPALS    IS
## 1      0.692 0.733
## 100    0.187 0.539
## 10000  0.057 0.538
## calculate and print the mean abs(e0.hat - e0) for each
## (sample size, method, std)

 ## log mortality schedule -> life table {dx} column
dx = function(lambda) {
  mx = exp(lambda)
  Hx = cumsum(mx)
  lx = c(1, exp(-Hx))
  tmp = -diff(lx)
  return(tmp/sum(tmp))
}

# ## life table e0
e0 = function(lambda) {
  this.dx = dx(lambda)
  ## approx ages at death for those dying in each interval
  death.age = 0:99+.5
  death.age[1] = 0.10   # infants
  return (sum( death.age * this.dx))
}

# ## true life expectancy used in all simulations
e0.BR   = e0(BR$logmx)
e0.BR
## [1] 70.71041
e0.hat = apply(logmx.hat, 2:5, e0)

e0.errors= apply( abs( e0.hat - e0.BR), c(1,3,4), mean, na.rm=TRUE)
round(e0.errors, 2)
## , , BR
## 
##       TOPALS   IS
## 1       4.94 4.79
## 100     1.29 0.52
## 10000   0.15 0.05
## 
## , , HMD
## 
##       TOPALS   IS
## 1       6.10 6.31
## 100     1.43 5.54
## 10000   0.15 5.54