gam.fit3.r 125 KB
Newer Older
1
## R routines for gam fitting with calculation of derivatives w.r.t. sp.s
2
## (c) Simon Wood 2004-2013
3

4
## These routines are for type 3 gam fitting. The basic idea is that a P-IRLS
5
## is run to convergence, and only then is a scheme for evaluating the 
6
## derivatives via the implicit function theorem used. 
7 8


9 10 11 12
gam.reparam <- function(rS,lsp,deriv) 
## Finds an orthogonal reparameterization which avoids `dominant machine zero leakage' between 
## components of the square root penalty.
## rS is the list of the square root penalties: last entry is root of fixed. 
13
##    penalty, if fixed.penalty=TRUE (i.e. length(rS)>length(sp))
14
## lsp is the vector of log smoothing parameters.
15 16
## *Assumption* here is that rS[[i]] are in a null space of total penalty already;
## see e.g. totalPenaltySpace & mini.roots
17 18 19
## Ouputs:
## S -- the total penalty matrix similarity transformed for stability
## rS -- the component square roots, transformed in the same way
20
##       - tcrossprod(rS[[i]]) = rS[[i]] %*% t(rS[[i]]) gives the matrix penalty component.
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## Qs -- the orthogonal transformation matrix S = t(Qs)%*%S0%*%Qs, where S0 is the 
##       untransformed total penalty implied by sp and rS on input
## E -- the square root of the transformed S (obtained in a stable way by pre-conditioning)
## det -- log |S|
## det1 -- dlog|S|/dlog(sp) if deriv >0
## det2 -- hessian of log|S| wrt log(sp) if deriv>1  
{ q <- nrow(rS[[1]])
  rSncol <- unlist(lapply(rS,ncol))
  M <- length(lsp) 
  if (length(rS)>M) fixed.penalty <- TRUE else fixed.penalty <- FALSE
  
  d.tol <- .Machine$double.eps^.3 ## group `similar sized' penalties, to save work

  r.tol <- .Machine$double.eps^.75 ## This is a bit delicate -- too large and penalty range space can be supressed.

36
  oo <- .C(C_get_stableS,S=as.double(matrix(0,q,q)),Qs=as.double(matrix(0,q,q)),sp=as.double(exp(lsp)),
37 38 39 40 41 42
                  rS=as.double(unlist(rS)), rSncol = as.integer(rSncol), q = as.integer(q),
                  M = as.integer(M), deriv=as.integer(deriv), det = as.double(0), 
                  det1 = as.double(rep(0,M)),det2 = as.double(matrix(0,M,M)), 
                  d.tol = as.double(d.tol),
                  r.tol = as.double(r.tol),
                  fixed_penalty = as.integer(fixed.penalty))
43
  S <- matrix(oo$S,q,q)  
44
  S <- (S+t(S))*.5
45 46 47
  p <- abs(diag(S))^.5            ## by Choleski, p can not be zero if S +ve def
  p[p==0] <- 1                    ## but it's possible to make a mistake!!
  ##E <-  t(t(chol(t(t(S/p)/p)))*p) 
48 49 50
  St <- t(t(S/p)/p)
  St <- (St + t(St))*.5 ## force exact symmetry -- avoids very rare mroot fails 
  E <- t(mroot(St,rank=q)*p) ## the square root S, with column separation
51 52 53 54 55 56 57 58 59 60 61 62
  Qs <- matrix(oo$Qs,q,q)         ## the reparameterization matrix t(Qs)%*%S%*%Qs -> S
  k0 <- 1
  for (i in 1:length(rS)) { ## unpack the rS in the new space
    crs <- ncol(rS[[i]]);
    k1 <- k0 + crs * q - 1 
    rS[[i]] <- matrix(oo$rS[k0:k1],q,crs)
    k0 <- k1 + 1
  }
  ## now get determinant + derivatives, if required...
  if (deriv > 0) det1 <- oo$det1 else det1 <- NULL
  if (deriv > 1) det2 <- matrix(oo$det2,M,M) else det2 <- NULL  
  list(S=S,E=E,Qs=Qs,rS=rS,det=oo$det,det1=det1,det2=det2,fixed.penalty = fixed.penalty)
63
} ## gam.reparam
64 65


66

67
gam.fit3 <- function (x, y, sp, Eb,UrS=list(),
68
            weights = rep(1, nobs), start = NULL, etastart = NULL, 
69 70
            mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(), 
            control = gam.control(), intercept = TRUE,deriv=2,
71
            gamma=1,scale=1,printWarn=TRUE,scoreType="REML",null.coef=rep(0,ncol(x)),
72
            pearson.extra=0,dev.extra=0,n.true=-1,Sl=NULL,...) {
73 74 75 76 77 78 79 80
## Inputs:
## * x model matrix
## * y response
## * sp log smoothing parameters
## * Eb square root of nicely balanced total penalty matrix used for rank detection
## * UrS list of penalty square roots in range space of overall penalty. UrS[[i]]%*%t(UrS[[i]]) 
##   is penalty. See 'estimate.gam' for more.
## * weights prior weights (reciprocal variance scale)
81
## * start initial values for parameters. ignored if etastart or mustart present (although passed on).
82
## * etastart initial values for eta
83 84
## * mustart initial values for mu. discarded if etastart present.
## * control - control list.
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
## * intercept - indicates whether model has one.
## * deriv - order 0,1 or 2 derivatives are to be returned (lower is cheaper!)
## * gamma - multiplier for effective degrees of freedom in GCV/UBRE.
## * scale parameter. Negative signals to estimate.
## * printWarn print or supress?
## * scoreType - type of smoothness selection to use.
## * null.coef - coefficients for a null model, in order to be able to check for immediate 
##   divergence.
## * pearson.extra is an extra component to add to the pearson statistic in the P-REML/P-ML 
##   case, only.
## * dev.extra is an extra component to add to the deviance in the REML and ML cases only.
## * n.true is to be used in place of the length(y) in ML/REML calculations,
##   and the scale.est only.
## 
## Version with new reparameterization and truncation strategy. Allows iterative weights 
## to be negative. Basically the workhorse routine for Wood (2011) JRSSB.
## A much modified version of glm.fit. Purpose is to estimate regression coefficients 
## and compute a smoothness selection score along with its derivatives.
##
    if (control$trace) { t0 <- proc.time();tc <- 0} 
105 106 107 108 109 110 111 112 113 114 115 116
  
    if (inherits(family,"extended.family")) { ## then actually gam.fit4/5 is needed
      if (inherits(family,"general.family")) {
        return(gam.fit5(x,y,sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv,
                        family=family,control=control,Mp=Mp,start=start))
      } else
      return(gam.fit4(x, y, sp, Eb,UrS=UrS,
            weights = weights, start = start, etastart = etastart, 
            mustart = mustart, offset = offset,U1=U1, Mp=Mp, family = family, 
            control = control, deriv=deriv,
            scale=scale,scoreType=scoreType,null.coef=null.coef,...))
    }
117 118

    if (family$link==family$canonical) fisher <- TRUE else fisher=FALSE 
119
    ## ... if canonical Newton = Fisher, but Fisher cheaper!
120 121 122 123 124 125
    if (scale>0) scale.known <- TRUE else scale.known <- FALSE
    if (!scale.known&&scoreType%in%c("REML","ML")) { ## the final element of sp is actually log(scale)
      nsp <- length(sp)
      scale <- exp(sp[nsp])
      sp <- sp[-nsp]
    }
126
    if (!deriv%in%c(0,1,2)) stop("unsupported order of differentiation requested of gam.fit3")
127 128 129 130
    x <- as.matrix(x)  
    nSp <- length(sp)  
    if (nSp==0) deriv.sp <- 0 else deriv.sp <- deriv 

131 132
    rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency

133 134 135 136
    xnames <- dimnames(x)[[2]]
    ynames <- if (is.matrix(y)) 
        rownames(y)
    else names(y)
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163


    q <- ncol(x)
    if (length(UrS)) { ## find a stable reparameterization...
    
      grderiv <- deriv*as.numeric(scoreType%in%c("REML","ML","P-REML","P-ML"))
      rp <- gam.reparam(UrS,sp,grderiv) ## note also detects fixed penalty if present
 ## Following is for debugging only...
 #     deriv.check <- FALSE
 #     if (deriv.check&&grderiv) {
 #       eps <- 1e-4
 #       fd.grad <- rp$det1
 #       for (i in 1:length(sp)) {
 #         spp <- sp; spp[i] <- spp[i] + eps/2
 #         rp1 <- gam.reparam(UrS,spp,grderiv)
 #         spp[i] <- spp[i] - eps
 #         rp0 <- gam.reparam(UrS,spp,grderiv)
 #         fd.grad[i] <- (rp1$det-rp0$det)/eps
 #       }
 #       print(fd.grad)
 #       print(rp$det1) 
 #     }

      T <- diag(q)
      T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
      T <- U1%*%T ## new params b'=T'b old params
    
164
      null.coef <- t(T)%*%null.coef  
165 166 167
 
      if (!is.null(start)) start <- t(T)%*%start
    
168 169 170
      ## form x%*%T in parallel 
      x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads)
      ## x <- x%*%T   ## model matrix 0(nq^2)
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
   
      rS <- list()
      for (i in 1:length(UrS)) {
        rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
      } ## square roots of penalty matrices in current parameterization
      Eb <- Eb%*%T ## balanced penalty matrix
      rows.E <- q-Mp
      Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
      St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
    } else { 
      T <- diag(q); 
      St <- matrix(0,q,q) 
      rSncol <- sp <- rows.E <- Eb <- Sr <- 0   
      rS <- list(0)
      rp <- list(det=0,det1 = rep(0,0),det2 = rep(0,0),fixed.penalty=FALSE)
    }
    iter <- 0;coef <- rep(0,ncol(x))
   
189 190
    conv <- FALSE
    n <- nobs <- NROW(y) ## n is just to keep codetools happy
191
    if (n.true <= 0) n.true <- nobs ## n.true is used in criteria in place of nobs
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
    nvars <- ncol(x)
    EMPTY <- nvars == 0
    if (is.null(weights)) 
        weights <- rep.int(1, nobs)
    if (is.null(offset)) 
        offset <- rep.int(0, nobs)
    variance <- family$variance
    dev.resids <- family$dev.resids
    aic <- family$aic
    linkinv <- family$linkinv
    mu.eta <- family$mu.eta
    if (!is.function(variance) || !is.function(linkinv)) 
        stop("illegal `family' argument")
    valideta <- family$valideta
    if (is.null(valideta)) 
        valideta <- function(eta) TRUE
    validmu <- family$validmu
    if (is.null(validmu)) 
210
       validmu <- function(mu) TRUE
211

212 213
    if (is.null(mustart)) {
        eval(family$initialize)
214
    } else {
215 216 217 218 219 220
        mukeep <- mustart
        eval(family$initialize)
        mustart <- mukeep
    }

    ## Added code
221 222
    if (family$family=="gaussian"&&family$link=="identity") strictly.additive <- TRUE else
      strictly.additive <- FALSE
223 224 225

    ## end of added code

226 227 228 229
    D1 <- D2 <- P <- P1 <- P2 <- trA <- trA1 <- trA2 <- 
        GCV<- GCV1<- GCV2<- GACV<- GACV1<- GACV2<- UBRE <-
        UBRE1<- UBRE2<- REML<- REML1<- REML2 <-NULL

230 231 232 233 234 235 236 237
    if (EMPTY) {
        eta <- rep.int(0, nobs) + offset
        if (!valideta(eta)) 
            stop("Invalid linear predictor values in empty model")
        mu <- linkinv(eta)
        if (!validmu(mu)) 
            stop("Invalid fitted means in empty model")
        dev <- sum(dev.resids(y, mu, weights))
238
        w <- (weights * mu.eta(eta)^2)/variance(mu)   ### BUG: incorrect for Newton
239 240 241 242 243 244 245 246 247 248 249 250 251 252
        residuals <- (y - mu)/mu.eta(eta)
        good <- rep(TRUE, length(residuals))
        boundary <- conv <- TRUE
        coef <- numeric(0)
        iter <- 0
        V <- variance(mu)
        alpha <- dev
        trA2 <- trA1 <- trA <- 0
        if (deriv) GCV2 <- GCV1<- UBRE2 <- UBRE1<-trA1 <- rep(0,nSp)
        GCV <- nobs*alpha/(nobs-gamma*trA)^2
        UBRE <- alpha/nobs - scale + 2*gamma/n*trA
        scale.est <- alpha / (nobs - trA)
    } ### end if (EMPTY)
    else {
253
        ##coefold <- NULL
254 255 256 257
        eta <- if (!is.null(etastart)) 
            etastart
        else if (!is.null(start)) 
            if (length(start) != nvars) 
258 259
                stop(gettextf("Length of start should equal %d and correspond to initial coefs for %s", 
                     nvars, deparse(xnames)))
260 261 262 263 264 265 266
            else {
                coefold <- start
                offset + as.vector(if (NCOL(x) == 1) 
                  x * start
                else x %*% start)
            }
        else family$linkfun(mustart)
267
        #etaold <- eta
268 269
        ##muold <- 
        mu <- linkinv(eta)
270 271
        #if (!(validmu(mu) && valideta(eta))) 
        #    stop("Can't find valid starting values: please specify some")
272
    
273 274
        boundary <- conv <- FALSE
        rV=matrix(0,ncol(x),ncol(x))   
275 276
       
        ## need an initial `null deviance' to test for initial divergence... 
277 278 279 280 281
        ## Note: can be better not to shrink towards start on
        ## immediate failure, in case start is on edge of feasible space...
        ## if (!is.null(start)) null.coef <- start
        coefold <- null.coef
        etaold <- null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))
282 283
        old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights)) + t(null.coef)%*%St%*%null.coef 
        ## ... if the deviance exceeds this then there is an immediate problem
284 285 286 287 288 289 290 291 292
        ii <- 0
        while (!(validmu(mu) && valideta(eta))) { ## shrink towards null.coef if immediately invalid
          ii <- ii + 1
          if (ii>20) stop("Can't find valid starting values: please specify some")
          if (!is.null(start)) start <- start * .9 + coefold * .1
          eta <- .9 * eta + .1 * etaold  
          mu <- linkinv(eta)
        }

293
        for (iter in 1:control$maxit) { ## start of main fitting iteration
294
            good <- weights > 0
295 296
            var.val <- variance(mu)
            varmu <- var.val[good]
297 298 299 300 301 302 303
            if (any(is.na(varmu))) 
                stop("NAs in V(mu)")
            if (any(varmu == 0)) 
                stop("0s in V(mu)")
            mu.eta.val <- mu.eta(eta)
            if (any(is.na(mu.eta.val[good]))) 
                stop("NAs in d(mu)/d(eta)")
304
            
305
            good <- (weights > 0) & (mu.eta.val != 0)
306
         
307 308
            if (all(!good)) {
                conv <- FALSE
309
                warning(gettextf("No observations informative at iteration %d", iter))
310 311
                break
            }
312 313 314 315 316 317 318 319 320
            mevg<-mu.eta.val[good];mug<-mu[good];yg<-y[good]
            weg<-weights[good];var.mug<-var.val[good]
            if (fisher) { ## Conventional Fisher scoring
              z <- (eta - offset)[good] + (yg - mug)/mevg
              w <- (weg * mevg^2)/var.mug
            } else { ## full Newton
              c = yg - mug
              alpha <- 1 + c*(family$dvar(mug)/var.mug + family$d2link(mug)*mevg)
              alpha[alpha==0] <- .Machine$double.eps
321 322
              z <- (eta - offset)[good] + (yg-mug)/(mevg*alpha) 
              ## ... offset subtracted as eta = X%*%beta + offset
323 324
              w <- weg*alpha*mevg^2/var.mug
            }
325

326
            ## Here a Fortran call has been replaced by pls_fit1 call
327 328
           
            if (sum(good)<ncol(x)) stop("Not enough informative observations.")
329
            if (control$trace) t1 <- proc.time()
330
            oo <- .C(C_pls_fit1,y=as.double(z),X=as.double(x[good,]),w=as.double(w),wy=as.double(w*z),
331 332
                     E=as.double(Sr),Es=as.double(Eb),n=as.integer(sum(good)),
                     q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
333 334
                     penalty=as.double(1),rank.tol=as.double(rank.tol),nt=as.integer(control$nthreads),
                     use.wy=as.integer(0))
335
            if (control$trace) tc <- tc + sum((proc.time()-t1)[c(1,4)])
336 337 338 339

            if (!fisher&&oo$n<0) { ## likelihood indefinite - switch to Fisher for this step
              z <- (eta - offset)[good] + (yg - mug)/mevg
              w <- (weg * mevg^2)/var.mug
340
              if (control$trace) t1 <- proc.time()
341
              oo <- .C(C_pls_fit1,y=as.double(z),X=as.double(x[good,]),w=as.double(w),wy=as.double(w*z),
342 343
                       E=as.double(Sr),Es=as.double(Eb),n=as.integer(sum(good)),
                       q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
344 345
                       penalty=as.double(1),rank.tol=as.double(rank.tol),nt=as.integer(control$nthreads),
                       use.wy=as.integer(0))
346
              if (control$trace) tc <- tc + sum((proc.time()-t1)[c(1,4)])
347 348
            }

349 350
            start <- oo$y[1:ncol(x)];
            penalty <- oo$penalty
351
            eta <- drop(x%*%start)
352 353 354

            if (any(!is.finite(start))) {
                conv <- FALSE
355 356
                warning(gettextf("Non-finite coefficients at iteration %d", 
                  iter))
357
                break
358
            }        
359 360 361 362 363
     
           mu <- linkinv(eta <- eta + offset)
           dev <- sum(dev.resids(y, mu, weights))
          
           if (control$trace) 
364
             message(gettextf("Deviance = %s Iterations - %d", dev, iter, domain = "R-mgcv"))
365 366 367
            boundary <- FALSE
            
            if (!is.finite(dev)) {
368 369
                if (is.null(coefold)) {
                  if (is.null(null.coef)) 
370 371
                  stop("no valid set of coefficients has been found:please supply starting values", 
                    call. = FALSE)
372 373 374 375
                  ## Try to find feasible coefficients from the null.coef and null.eta
                  coefold <- null.coef
                  etaold <- null.eta
                }
376 377 378 379 380 381 382 383 384
                warning("Step size truncated due to divergence", 
                  call. = FALSE)
                ii <- 1
                while (!is.finite(dev)) {
                  if (ii > control$maxit) 
                    stop("inner loop 1; can't correct step size")
                  ii <- ii + 1
                  start <- (start + coefold)/2
                  eta <- (eta + etaold)/2               
385
                  mu <- linkinv(eta)
386 387
                  dev <- sum(dev.resids(y, mu, weights))
                }
388 389
                boundary <- TRUE 
                penalty <- t(start)%*%St%*%start
390 391 392 393 394 395 396 397 398 399 400 401 402
                if (control$trace) 
                  cat("Step halved: new deviance =", dev, "\n")
            }
            if (!(valideta(eta) && validmu(mu))) {
                warning("Step size truncated: out of bounds", 
                  call. = FALSE)
                ii <- 1
                while (!(valideta(eta) && validmu(mu))) {
                  if (ii > control$maxit) 
                    stop("inner loop 2; can't correct step size")
                  ii <- ii + 1
                  start <- (start + coefold)/2
                  eta <- (eta + etaold)/2 
403
                  mu <- linkinv(eta)
404 405
                }
                boundary <- TRUE
406
                penalty <- t(start)%*%St%*%start
407 408 409 410 411 412 413
                dev <- sum(dev.resids(y, mu, weights))
                if (control$trace) 
                  cat("Step halved: new deviance =", dev, "\n")
            }

            pdev <- dev + penalty  ## the penalized deviance 

414
            if (control$trace) 
415
                message(gettextf("penalized deviance = %s", pdev, domain = "R-mgcv"))
416
               
417

418 419 420 421
            div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5 
            ## ... threshold for judging divergence --- too tight and near
            ## perfect convergence can cause a failure here

422
            if (pdev-old.pdev>div.thresh) { ## solution diverging
423
             ii <- 1 ## step halving counter
424 425 426
             if (iter==1) { ## immediate divergence, need to shrink towards zero 
               etaold <- null.eta; coefold <- null.coef
             }
427
             while (pdev -old.pdev > div.thresh)  
428
             { ## step halve until pdev <= old.pdev
429
                if (ii > 100) 
430 431 432 433
                   stop("inner loop 3; can't correct step size")
                ii <- ii + 1
                start <- (start + coefold)/2 
                eta <- (eta + etaold)/2               
434
                mu <- linkinv(eta)
435 436
                  dev <- sum(dev.resids(y, mu, weights))
                  pdev <- dev + t(start)%*%St%*%start ## the penalized deviance
437
                if (control$trace) 
438
                  message(gettextf("Step halved: new penalized deviance = %g", pdev, "\n"))
439 440
              }
            } 
441 442
            
            if (strictly.additive) { conv <- TRUE;coef <- start;break;}
443 444

            if (abs(pdev - old.pdev)/(0.1 + abs(pdev)) < control$epsilon) {
445 446 447 448 449 450
               ## Need to check coefs converged adequately, to ensure implicit differentiation
               ## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
               ## drop same parameter every iteration!)       
               grad <- 2 * t(x[good,])%*%(w*((x%*%start)[good]-z))+ 2*St%*%start 
               if (max(abs(grad)) > control$epsilon*max(abs(start+coefold))/2) {
               ##if (max(abs(start-coefold))>control$epsilon*max(abs(start+coefold))/2) {
451
               ## if (max(abs(mu-muold))>control$epsilon*max(abs(mu+muold))/2) {
452 453 454
                  old.pdev <- pdev
                  coef <- coefold <- start
                  etaold <- eta 
455
                  ##muold <- mu
456 457 458
                } else {
                  conv <- TRUE
                  coef <- start
459
                  etaold <- eta
460 461
                  break 
                }
462 463 464 465 466 467 468
            }
            else {  old.pdev <- pdev
                coef <- coefold <- start
                etaold <- eta 
            }
        } ### end main loop 
       
469 470 471 472
        wdr <- dev.resids(y, mu, weights)
        dev <- sum(wdr) 
        wdr <- sign(y-mu)*sqrt(pmax(wdr,0)) ## used below in scale estimation 
  
473 474 475 476 477 478 479 480 481 482 483 484 485
        ## Now call the derivative calculation scheme. This requires the
        ## following inputs:
        ## z and w - the pseudodata and weights
        ## X the model matrix and E where EE'=S
        ## rS the single penalty square roots
        ## sp the log smoothing parameters
        ## y and mu the data and model expected values
        ## g1,g2,g3 - the first 3 derivatives of g(mu) wrt mu
        ## V,V1,V2 - V(mu) and its first two derivatives wrt mu
        ## on output it returns the gradient and hessian for
        ## the deviance and trA 

         good <- weights > 0
486 487
         var.val <- variance(mu)
         varmu <- var.val[good]
488 489 490 491 492 493
         if (any(is.na(varmu))) stop("NAs in V(mu)")
         if (any(varmu == 0)) stop("0s in V(mu)")
         mu.eta.val <- mu.eta(eta)
         if (any(is.na(mu.eta.val[good]))) 
                stop("NAs in d(mu)/d(eta)")
   
494 495

         good <- (weights > 0) & (mu.eta.val != 0)
496 497
         mevg <- mu.eta.val[good];mug <- mu[good];yg <- y[good]
         weg <- weights[good];etag <- eta[good]
498 499 500 501 502 503 504 505 506
         var.mug<-var.val[good]

         if (fisher) { ## Conventional Fisher scoring
              z <- (eta - offset)[good] + (yg - mug)/mevg
              w <- (weg * mevg^2)/var.mug
              alpha <- wf <- 0 ## Don't need Fisher weights separately
         } else { ## full Newton
              c <- yg - mug
              alpha <- 1 + c*(family$dvar(mug)/var.mug + family$d2link(mug)*mevg)
507 508 509 510 511 512
              ### can't just drop obs when alpha==0, as they are informative, but
              ### happily using an `effective zero' is stable here, and there is 
              ### a natural effective zero, since E(alpha) = 1.
              alpha[alpha==0] <- .Machine$double.eps 
              z <- (eta - offset)[good] + (yg-mug)/(mevg*alpha) 
              ## ... offset subtracted as eta = X%*%beta + offset
513 514 515
              wf <- weg*mevg^2/var.mug ## Fisher weights for EDF calculation
              w <- wf * alpha   ## Full Newton weights
         }
516 517 518 519 520 521 522 523
        
         g1 <- 1/mevg
         g2 <- family$d2link(mug)
         g3 <- family$d3link(mug)

         V <- family$variance(mug)
         V1 <- family$dvar(mug)
         V2 <- family$d2var(mug)      
524 525 526 527 528 529 530 531 532 533 534 535
        
         if (fisher) {
           g4 <- V3 <- 0
         } else {
           g4 <- family$d4link(mug)
           V3 <- family$d3var(mug)
         }

         if (TRUE) { ### TEST CODE for derivative ratio based versions of code... 
           g2 <- g2/g1;g3 <- g3/g1;g4 <- g4/g1
           V1 <- V1/V;V2 <- V2/V;V3 <- V3/V
         }
536 537 538 539 540

         P1 <- D1 <- array(0,nSp);P2 <- D2 <- matrix(0,nSp,nSp) # for derivs of deviance/ Pearson
         trA1 <- array(0,nSp);trA2 <- matrix(0,nSp,nSp) # for derivs of tr(A)
         rV=matrix(0,ncol(x),ncol(x));
         dum <- 1
541
         if (control$trace) cat("calling gdi...")
542

543
       REML <- 0 ## signals GCV/AIC used
544 545
       if (scoreType%in%c("REML","P-REML")) {REML <- 1;remlInd <- 1} else 
       if (scoreType%in%c("ML","P-ML")) {REML <- -1;remlInd <- 0} 
546 547

       if (REML==0) rSncol <- unlist(lapply(rS,ncol)) else rSncol <- unlist(lapply(UrS,ncol))
548
       if (control$trace) t1 <- proc.time()
549 550 551 552 553 554
       oo <- .C(C_gdi1,X=as.double(x[good,]),E=as.double(Sr),Eb = as.double(Eb), 
                rS = as.double(unlist(rS)),U1=as.double(U1),sp=as.double(exp(sp)),
                z=as.double(z),w=as.double(w),wf=as.double(wf),alpha=as.double(alpha),
                mu=as.double(mug),eta=as.double(etag),y=as.double(yg),
                p.weights=as.double(weg),g1=as.double(g1),g2=as.double(g2),
                g3=as.double(g3),g4=as.double(g4),V0=as.double(V),V1=as.double(V1),
555
                V2=as.double(V2),V3=as.double(V3),beta=as.double(coef),b1=as.double(rep(0,nSp*ncol(x))),
556
                w1=as.double(rep(0,nSp*length(z))),
557
                D1=as.double(D1),D2=as.double(D2),P=as.double(dum),P1=as.double(P1),P2=as.double(P2),
558
                trA=as.double(dum),trA1=as.double(trA1),trA2=as.double(trA2),
559
                rV=as.double(rV),rank.tol=as.double(rank.tol),
560 561 562 563
                conv.tol=as.double(control$epsilon),rank.est=as.integer(1),n=as.integer(length(z)),
                p=as.integer(ncol(x)),M=as.integer(nSp),Mp=as.integer(Mp),Enrow = as.integer(rows.E),
                rSncol=as.integer(rSncol),deriv=as.integer(deriv.sp),
                REML = as.integer(REML),fisher=as.integer(fisher),
564 565
                fixed.penalty = as.integer(rp$fixed.penalty),nthreads=as.integer(control$nthreads),
                dVkk=as.double(rep(0,nSp*nSp)))      
566 567 568 569
         if (control$trace) { 
           tg <- sum((proc.time()-t1)[c(1,4)])
           cat("done!\n")
         }
570
 
571 572
         ## get dbeta/drho, directly in original parameterization
         db.drho <- if (deriv) T%*%matrix(oo$b1,ncol(x),nSp) else NULL
573
         dw.drho <- if (deriv) matrix(oo$w1,length(z),nSp) else NULL
574

575 576 577 578 579
         rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix 
         
         Kmat <- matrix(0,nrow(x),ncol(x)) 
         Kmat[good,] <- oo$X                    ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array 

580
         coef <- oo$beta;
581 582 583 584 585 586 587 588 589 590 591 592
         eta <- drop(x%*%coef + offset)
         mu <- linkinv(eta)
         if (!(validmu(mu)&&valideta(eta))) {
           ## if iteration terminated with step halving then it can be that
           ## gdi1 returns an invalid coef, because likelihood is actually
           ## pushing coefs to invalid region. Probably not much hope in 
           ## this case, but it is worth at least returning feasible values,
           ## even though this is not quite consistent with derivs.
           coef <- start
           eta <- etaold
           mu <- linkinv(eta)
         }
593
         trA <- oo$trA;
594 595 596 597 598
                  
         if (control$scale.est%in%c("pearson","fletcher","Pearson","Fletcher")) {
            pearson <- sum(weights*(y-mu)^2/family$variance(mu))
            scale.est <- (pearson+dev.extra)/(n.true-trA)
            if (control$scale.est%in%c("fletcher","Fletcher")) { ## Apply Fletcher (2012) correction
599 600
              ## note limited to 10 times Pearson...
              s.bar = max(-.9,mean(family$dvar(mu)*(y-mu)*sqrt(weights)/family$variance(mu)))
601 602 603 604 605
              if (is.finite(s.bar)) scale.est <- scale.est/(1+s.bar)
            }
         } else { ## use the deviance estimator
           scale.est <- (dev+dev.extra)/(n.true-trA)
         }
606

607
        reml.scale <- NA  
608 609 610

        if (scoreType%in%c("REML","ML")) { ## use Laplace (RE)ML
          
611 612 613
          ls <- family$ls(y,weights,n,scale)*n.true/nobs ## saturated likelihood and derivatives
          Dp <- dev + oo$conv.tol + dev.extra
          REML <- Dp/(2*scale) - ls[1] + oo$rank.tol/2 - rp$det/2 - remlInd*Mp/2*log(2*pi*scale)
614
          attr(REML,"Dp") <- Dp/(2*scale)
615
          if (deriv) {
616
            REML1 <- oo$D1/(2*scale) + oo$trA1/2 - rp$det1/2 
617 618 619 620 621 622
            if (deriv==2) REML2 <- (matrix(oo$D2,nSp,nSp)/scale + matrix(oo$trA2,nSp,nSp) - rp$det2)/2
            if (sum(!is.finite(REML2))) {
               stop("Non finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.contol'")
            }
          }
          if (!scale.known&&deriv) { ## need derivatives wrt log scale, too 
623 624
            ##ls <- family$ls(y,weights,n,scale) ## saturated likelihood and derivatives
            dlr.dlphi <- -Dp/(2 *scale) - ls[2]*scale - Mp/2*remlInd
625 626 627 628 629 630 631 632 633 634
            d2lr.d2lphi <- Dp/(2*scale) - ls[3]*scale^2 - ls[2]*scale
            d2lr.dspphi <- -oo$D1/(2*scale)
            REML1 <- c(REML1,dlr.dlphi)
            if (deriv==2) {
              REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
              REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
            }
          }
          reml.scale <- scale
        } else if (scoreType%in%c("P-REML","P-ML")) { ## scale unknown use Pearson-Laplace REML
635 636 637 638 639 640
          reml.scale <- phi <- (oo$P*(nobs-Mp)+pearson.extra)/(n.true-Mp) ## REMLish scale estimate
          ## correct derivatives, if needed...
          oo$P1 <- oo$P1*(nobs-Mp)/(n.true-Mp)
          oo$P2 <- oo$P2*(nobs-Mp)/(n.true-Mp)

          ls <- family$ls(y,weights,n,phi)*n.true/nobs ## saturated likelihood and derivatives
641
        
642
          Dp <- dev + oo$conv.tol + dev.extra
643
         
644 645
          K <- oo$rank.tol/2 - rp$det/2
                 
646
          REML <- Dp/(2*phi) - ls[1] + K - Mp/2*log(2*pi*phi)*remlInd
647
          attr(REML,"Dp") <- Dp/(2*phi)
648 649
          if (deriv) {
            phi1 <- oo$P1; Dp1 <- oo$D1; K1 <- oo$trA1/2 - rp$det1/2;
650
            REML1 <- Dp1/(2*phi) - phi1*(Dp/(2*phi^2)+Mp/(2*phi)*remlInd + ls[2]) + K1
651 652 653 654 655
            if (deriv==2) {
                   phi2 <- matrix(oo$P2,nSp,nSp);Dp2 <- matrix(oo$D2,nSp,nSp)
                   K2 <- matrix(oo$trA2,nSp,nSp)/2 - rp$det2/2   
                   REML2 <- 
                   Dp2/(2*phi) - (outer(Dp1,phi1)+outer(phi1,Dp1))/(2*phi^2) +
656 657
                   (Dp/phi^3 - ls[3] + Mp/(2*phi^2)*remlInd)*outer(phi1,phi1) -
                   (Dp/(2*phi^2)+ls[2]+Mp/(2*phi)*remlInd)*phi2 + K2
658 659 660 661
            }
          }
 
        } else { ## Not REML ....
662

663 664 665 666 667 668 669
           P <- oo$P
           
           delta <- nobs - gamma * trA
           delta.2 <- delta*delta           
  
           GCV <- nobs*dev/delta.2
           GACV <- dev/nobs + P * 2*gamma*trA/(delta * nobs) 
670

671 672 673 674
           UBRE <- dev/nobs - 2*delta*scale/nobs + scale
        
           if (deriv) {
             trA1 <- oo$trA1
675
           
676 677
             D1 <- oo$D1
             P1 <- oo$P1
678
          
679
             if (sum(!is.finite(D1))||sum(!is.finite(P1))||sum(!is.finite(trA1))) { 
680 681 682
                 stop(
               "Non-finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.contol'")
             }
683
         
684 685 686 687
             delta.3 <- delta*delta.2
  
             GCV1 <- nobs*D1/delta.2 + 2*nobs*dev*trA1*gamma/delta.3
             GACV1 <- D1/nobs + 2*P/delta.2 * trA1 + 2*gamma*trA*P1/(delta*nobs)
688

689 690 691 692 693
             UBRE1 <- D1/nobs + gamma * trA1 *2*scale/nobs
             if (deriv==2) {
               trA2 <- matrix(oo$trA2,nSp,nSp) 
               D2 <- matrix(oo$D2,nSp,nSp)
               P2 <- matrix(oo$P2,nSp,nSp)
694
              
695
               if (sum(!is.finite(D2))||sum(!is.finite(P2))||sum(!is.finite(trA2))) { 
696 697 698
                 stop(
                 "Non-finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.contol'")
               }
699
             
700 701
               GCV2 <- outer(trA1,D1)
               GCV2 <- (GCV2 + t(GCV2))*gamma*2*nobs/delta.3 +
702 703
                      6*nobs*dev*outer(trA1,trA1)*gamma*gamma/(delta.2*delta.2) + 
                      nobs*D2/delta.2 + 2*nobs*dev*gamma*trA2/delta.3  
704
               GACV2 <- D2/nobs + outer(trA1,trA1)*4*P/(delta.3) +
705 706 707
                      2 * P * trA2 / delta.2 + 2 * outer(trA1,P1)/delta.2 +
                      2 * outer(P1,trA1) *(1/(delta * nobs) + trA/(nobs*delta.2)) +
                      2 * trA * P2 /(delta * nobs) 
708 709 710 711 712
               GACV2 <- (GACV2 + t(GACV2))*.5
               UBRE2 <- D2/nobs +2*gamma * trA2 * scale / nobs
             } ## end if (deriv==2)
           } ## end if (deriv)
        } ## end !REML
713 714 715 716 717 718
        # end of inserted code
        if (!conv&&printWarn) 
            warning("Algorithm did not converge")
        if (printWarn&&boundary) 
            warning("Algorithm stopped at boundary value")
        eps <- 10 * .Machine$double.eps
719
        if (printWarn&&family$family[1] == "binomial") {
720 721 722
            if (any(mu > 1 - eps) || any(mu < eps)) 
                warning("fitted probabilities numerically 0 or 1 occurred")
        }
723
        if (printWarn&&family$family[1] == "poisson") {
724 725 726 727 728 729 730
            if (any(mu < eps)) 
                warning("fitted rates numerically 0 occurred")
        }
 
        residuals <- rep.int(NA, nobs)
        residuals[good] <- z - (eta - offset)[good]
          
731 732 733
        ## undo reparameterization....
        coef <- as.numeric(T %*% coef)
        rV <- T %*% rV
734 735 736 737 738
        names(coef) <- xnames 
    } ### end if (!EMPTY)
    names(residuals) <- ynames
    names(mu) <- ynames
    names(eta) <- ynames
739 740 741 742 743
    ww <- wt <- rep.int(0, nobs)
    if (fisher) { wt[good] <- w; ww <- wt} else { 
      wt[good] <- wf  ## note that Fisher weights are returned
      ww[good] <- w
    }
744 745 746
    names(wt) <- ynames
    names(weights) <- ynames
    names(y) <- ynames
747 748 749 750 751 752 753
    if (deriv && nrow(dw.drho)!=nrow(x)) {
      w1 <- dw.drho
      dw.drho <- matrix(0,nrow(x),ncol(w1))
      dw.drho[good,] <- w1
    }
    

754 755 756 757 758 759 760 761
    wtdmu <- if (intercept) 
        sum(weights * y)/sum(weights)
    else linkinv(offset)
    nulldev <- sum(dev.resids(y, wtdmu, weights))
    n.ok <- nobs - sum(weights == 0)
    nulldf <- n.ok - as.integer(intercept)
   
    aic.model <- aic(y, n, mu, weights, dev) # note: incomplete 2*edf needs to be added
762 763 764 765 766
    if (control$trace) {
      t1 <- proc.time()
      at <- sum((t1-t0)[c(1,4)])
      cat("Proportion time in C: ",(tc+tg)/at," ls:",tc/at," gdi:",tg/at,"\n")
    } 
767
   
768 769
    list(coefficients = coef, residuals = residuals, fitted.values = mu, 
         family = family, linear.predictors = eta, deviance = dev, 
770 771
        null.deviance = nulldev, iter = iter, weights = wt, working.weights=ww,prior.weights = weights, 
        df.null = nulldf, y = y, converged = conv,##pearson.warning = pearson.warning,
772
        boundary = boundary,D1=D1,D2=D2,P=P,P1=P1,P2=P2,trA=trA,trA1=trA1,trA2=trA2,
773
        GCV=GCV,GCV1=GCV1,GCV2=GCV2,GACV=GACV,GACV1=GACV1,GACV2=GACV2,UBRE=UBRE,
774
        UBRE1=UBRE1,UBRE2=UBRE2,REML=REML,REML1=REML1,REML2=REML2,rV=rV,db.drho=db.drho,
775
        dw.drho=dw.drho,dVkk = matrix(oo$dVkk,nSp,nSp),
776
        scale.est=scale.est,reml.scale= reml.scale,aic=aic.model,rank=oo$rank.est,K=Kmat)
777 778
} ## end gam.fit3

779
Vb.corr <- function(X,L,lsp0,S,off,dw,w,rho,Vr,nth=0,scale.est=FALSE) {
780 781 782 783
## compute higher order Vb correction...
## If w is NULL then X should be root Hessian, and 
## dw is treated as if it was 0, otherwise X should be model 
## matrix.
784
## dw is derivative w.r.t. all the smoothing parameters and family parameters as if these 
785
## were not linked, but not the scale parameter, of course. Vr includes scale uncertainty,
786
## if scale estimated...
787
## nth is the number of initial elements of rho that are not smoothing 
788 789
## parameters, scale.est is TRUE if scale estimated by REML and must be
## dropped from s.p.s
790 791 792 793 794 795 796
  M <- length(off) ## number of penalty terms
  if (scale.est) {
    ## drop scale param from L, rho and Vr...
    rho <- rho[-length(rho)]
    if (!is.null(L)) L <- L[-nrow(L),-ncol(L),drop=FALSE]
    Vr <- Vr[-nrow(Vr),-ncol(Vr),drop=FALSE]
  }
797 798 799 800
 
  if (is.null(lsp0)) lsp0 <- if (is.null(L)) rho*0 else rep(0,nrow(L))
  ## note that last element of lsp0 can be a scale parameter...
  lambda <- if (is.null(L)) exp(rho+lsp0[1:length(rho)]) else exp(L%*%rho + lsp0[1:nrow(L)])
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837
  
  ## Re-create the Hessian, if is.null(w) then X assumed to be root
  ## unpenalized Hessian...
  H <- if (is.null(w)) crossprod(X) else H <- t(X)%*%(w*X)
  if (M>0) for (i in 1:M) {
      ind <- off[i] + 1:ncol(S[[i]]) - 1
      H[ind,ind] <- H[ind,ind] + lambda[i+nth] * S[[i]]
  }

  R <- try(chol(H),silent=TRUE) ## get its Choleski factor.  
  if (inherits(R,"try-error")) return(0) ## bail out as Hessian insufficiently well conditioned
  
  ## Create dH the derivatives of the hessian w.r.t. (all) the smoothing parameters...
  dH <- list()
  if (length(lambda)>0) for (i in 1:length(lambda)) {
    ## If w==NULL use constant H approx...
    dH[[i]] <- if (is.null(w)) H*0 else t(X)%*%(dw[,i]*X) 
    if (i>nth) { 
      ind <- off[i-nth] + 1:ncol(S[[i-nth]]) - 1
      dH[[i]][ind,ind] <- dH[[i]][ind,ind] + lambda[i]*S[[i-nth]]
    }
  }
  ## If L supplied then dH has to be re-weighted to give
  ## derivatives w.r.t. optimization smoothing params.
  if (!is.null(L)) {
    dH1 <- dH;dH <- list()
    if (length(rho)>0) for (j in 1:length(rho)) { 
      ok <- FALSE ## dH[[j]] not yet created
      if (nrow(L)>0) for (i in 1:nrow(L)) if (L[i,j]!=0.0) { 
        dH[[j]] <- if (ok) dH[[j]] + dH1[[i]]*L[i,j] else dH1[[i]]*L[i,j]
        ok <- TRUE
      }
    } 
    rm(dH1)
  } ## dH now w.r.t. optimization parameters 
  
  if (length(dH)==0) return(0) ## nothing to correct
838

839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
  ## Get derivatives of Choleski factor w.r.t. the smoothing parameters 
  dR <- list()
  for (i in 1:length(dH)) dR[[i]] <- dchol(dH[[i]],R) 
  rm(dH)
  
  ## need to transform all dR to dR^{-1} = -R^{-1} dR R^{-1}...
  for (i in 1:length(dR)) dR[[i]] <- -t(forwardsolve(t(R),t(backsolve(R,dR[[i]]))))
 
  ## BUT: dR, now upper triangular, and it relates to RR' = Vb not R'R = Vb
  ## in consequence of which Rz is the thing with the right distribution
  ## and not R'z...
  dbg <- FALSE
  if (dbg) { ## debugging code
    n.rep <- 10000;p <- ncol(R)
    r <- rmvn(n.rep,rep(0,M),Vr)
    b <- matrix(0,n.rep,p)
    for (i in 1:n.rep) {
      z <- rnorm(p)
      if (M>0) for (j in 1:M) b[i,] <- b[i,] + dR[[j]]%*%z*(r[i,j]) 
    }
    Vfd <- crossprod(b)/n.rep
  }

  vcorr(dR,Vr,FALSE) ## NOTE: unscaled!!
} ## Vb.corr

865
gam.fit3.post.proc <- function(X,L,lsp0,S,off,object) {
866
## get edf array and covariance matrices after a gam fit. 
867
## X is original model matrix, L the mapping from working to full sp
868 869
  scale <- if (object$scale.estimated) object$scale.est else object$scale
  Vb <- object$rV%*%t(object$rV)*scale ## Bayesian cov.
870 871 872 873
  # PKt <- object$rV%*%t(object$K)
  PKt <- .Call(C_mgcv_pmmult2,object$rV,object$K,0,1,object$control$nthreads)
  # F <- PKt%*%(sqrt(object$weights)*X)
  F <- .Call(C_mgcv_pmmult2,PKt,sqrt(object$weights)*X,0,0,object$control$nthreads)
874 875
  edf <- diag(F) ## effective degrees of freedom
  edf1 <- 2*edf - rowSums(t(F)*F) ## alternative
876

877 878 879
  ## edf <- rowSums(PKt*t(sqrt(object$weights)*X))
  ## Ve <- PKt%*%t(PKt)*object$scale  ## frequentist cov
  Ve <- F%*%Vb ## not quite as stable as above, but quicker
880
  hat <- rowSums(object$K*object$K)
881 882 883
  ## get QR factor R of WX - more efficient to do this
  ## in gdi_1 really, but that means making QR of augmented 
  ## a two stage thing, so not clear cut...
884 885
  qrx <- pqr(sqrt(object$weights)*X,object$control$nthreads)
  R <- pqr.R(qrx);R[,qrx$pivot] <- R
886
  if (!is.na(object$reml.scale)&&!is.null(object$db.drho)) { ## compute sp uncertainty correction
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926
    hess <- object$outer.info$hess
    edge.correct <- if (is.null(attr(hess,"edge.correct"))) FALSE else TRUE
    K <- if (edge.correct) 2 else 1
    for (k in 1:K) {
      if (k==1) { ## fitted model computations
        db.drho <- object$db.drho
        dw.drho <- object$dw.drho
        lsp <- log(object$sp)
      } else { ## edge corrected model computations
        db.drho <- attr(hess,"db.drho1")
        dw.drho <- attr(hess,"dw.drho1")
        lsp <- attr(hess,"lsp1")
	hess <- attr(hess,"hess1")
      }
      M <- ncol(db.drho)
      ## transform to derivs w.r.t. working, noting that an extra final row of L
      ## may be present, relating to scale parameter (for which db.drho is 0 since it's a scale parameter)  
      if (!is.null(L)) { 
        db.drho <- db.drho%*%L[1:M,,drop=FALSE] 
        M <- ncol(db.drho)
      }
      ## extract cov matrix for log smoothing parameters...
      ev <- eigen(hess,symmetric=TRUE) 
      d <- ev$values;ind <- d <= 0
      d[ind] <- 0;d[!ind] <- 1/sqrt(d[!ind])
      rV <- (d*t(ev$vectors))[,1:M] ## root of cov matrix
      Vc <- crossprod(rV%*%t(db.drho))
      ## set a prior precision on the smoothing parameters, but don't use it to 
      ## fit, only to regularize Cov matrix. exp(4*var^.5) gives approx 
      ## multiplicative range. e.g. var = 5.3 says parameter between .01 and 100 times
      ## estimate. Avoids nonsense at `infinite' smoothing parameters.   
      d <- ev$values; d[ind] <- 0;
      d <- if (k==1) 1/sqrt(d+1/10) else 1/sqrt(d+1e-7)
      Vr <- crossprod(d*t(ev$vectors))
      ## Note that db.drho and dw.drho are derivatives w.r.t. full set of smoothing 
      ## parameters excluding any scale parameter, but Vr includes info for scale parameter
      ## if it has been estimated. 
      nth <- if (is.null(object$family$n.theta)) 0 else object$family$n.theta ## any parameters of family itself
      drop.scale <- object$scale.estimated && !(object$method %in% c("P-REML","P-ML"))
      Vc2 <- scale*Vb.corr(R,L,lsp0,S,off,dw.drho,w=NULL,lsp,Vr,nth,drop.scale)
927
    
928 929 930 931 932 933 934 935 936
      Vc <- Vb + Vc + Vc2 ## Bayesian cov matrix with sp uncertainty
      ## finite sample size check on edf sanity...
      if (k==1) { ## compute edf2 only with fitted model, not edge corrected
        edf2 <- rowSums(Vc*crossprod(R))/scale
        if (sum(edf2)>sum(edf1)) { 
          edf2 <- edf1
        }
      }
    } ## k loop
937 938
  } else edf2 <- Vc <- NULL
  list(Vc=Vc,Vb=Vb,Ve=Ve,edf=edf,edf1=edf1,edf2=edf2,hat=hat,F=F,R=R)
939
} ## gam.fit3.post.proc
940

941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973

score.transect <- function(ii, x, y, sp, Eb,UrS=list(), 
            weights = rep(1, length(y)), start = NULL, etastart = NULL, 
            mustart = NULL, offset = rep(0, length(y)),U1,Mp,family = gaussian(), 
            control = gam.control(), intercept = TRUE,deriv=2,
            gamma=1,scale=1,printWarn=TRUE,scoreType="REML",eps=1e-7,null.coef=rep(0,ncol(x)),...) {
## plot a transect through the score for sp[ii]
  np <- 200
  if (scoreType%in%c("REML","P-REML","ML","P-ML")) reml <- TRUE else reml <- FALSE
  score <- spi <- seq(-30,30,length=np)
  for (i in 1:np) {

     sp[ii] <- spi[i]
     b<-gam.fit3(x=x, y=y, sp=sp,Eb=Eb,UrS=UrS,
      offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
      control=control,gamma=gamma,scale=scale,
      printWarn=FALSE,mustart=mustart,scoreType=scoreType,null.coef=null.coef,...)

      if (reml) {
        score[i] <- b$REML
      } else if (scoreType=="GACV") {
        score[i] <- b$GACV
      } else if (scoreType=="UBRE"){
        score[i] <- b$UBRE 
      } else { ## default to deviance based GCV
        score[i] <- b$GCV
      }
  }
  par(mfrow=c(2,2),mar=c(4,4,1,1))
  plot(spi,score,xlab="log(sp)",ylab=scoreType,type="l")
  plot(spi[1:(np-1)],score[2:np]-score[1:(np-1)],type="l",ylab="differences")
  plot(spi,score,ylim=c(score[1]-.1,score[1]+.1),type="l")
  plot(spi,score,ylim=c(score[np]-.1,score[np]+.1),type="l")
974
} ## score.transect
975 976 977 978 979

deriv.check <- function(x, y, sp, Eb,UrS=list(), 
            weights = rep(1, length(y)), start = NULL, etastart = NULL, 
            mustart = NULL, offset = rep(0, length(y)),U1,Mp,family = gaussian(), 
            control = gam.control(), intercept = TRUE,deriv=2,
980 981
            gamma=1,scale=1,printWarn=TRUE,scoreType="REML",eps=1e-7,
            null.coef=rep(0,ncol(x)),Sl=Sl,...)
982 983 984 985 986
## FD checking of derivatives: basically a debugging routine
{  if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2")
   if (control$epsilon>1e-9) control$epsilon <- 1e-9 
   b<-gam.fit3(x=x, y=y, sp=sp,Eb=Eb,UrS=UrS,
      offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
987 988 989
      control=control,gamma=gamma,scale=scale,printWarn=FALSE,
      start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
      null.coef=null.coef,Sl=Sl,...)
990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012

   P0 <- b$P;fd.P1 <- P10 <- b$P1;  if (deriv==2) fd.P2 <- P2 <- b$P2 
   trA0 <- b$trA;fd.gtrA <- gtrA0 <- b$trA1 ; if (deriv==2) fd.htrA <- htrA <- b$trA2 
   dev0 <- b$deviance;fd.D1 <- D10 <- b$D1 ; if (deriv==2) fd.D2 <- D2 <- b$D2 

   if (scoreType%in%c("REML","P-REML","ML","P-ML")) reml <- TRUE else reml <- FALSE

   if (reml) {
     score0 <- b$REML;grad0 <- b$REML1; if (deriv==2) hess <- b$REML2 
   } else if (scoreType=="GACV") {
     score0 <- b$GACV;grad0 <- b$GACV1;if (deriv==2) hess <- b$GACV2 
   } else if (scoreType=="UBRE"){
     score0 <- b$UBRE;grad0 <- b$UBRE1;if (deriv==2) hess <- b$UBRE2 
   } else { ## default to deviance based GCV
     score0 <- b$GCV;grad0 <- b$GCV1;if (deriv==2) hess <- b$GCV2
   }
  
   fd.grad <- grad0
   if (deriv==2) fd.hess <- hess
   for (i in 1:length(sp)) {
     sp1 <- sp;sp1[i] <- sp[i]+eps/2
     bf<-gam.fit3(x=x, y=y, sp=sp1,Eb=Eb,UrS=UrS,
      offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
1013 1014 1015
      control=control,gamma=gamma,scale=scale,printWarn=FALSE,
      start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
      null.coef=null.coef,Sl=Sl,...)
1016 1017 1018 1019
      
     sp1 <- sp;sp1[i] <- sp[i]-eps/2
     bb<-gam.fit3(x=x, y=y, sp=sp1, Eb=Eb,UrS=UrS,
      offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
1020 1021 1022
      control=control,gamma=gamma,scale=scale,printWarn=FALSE,
      start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
     null.coef=null.coef,Sl=Sl,...)
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
      
   
      if (!reml) {
        Pb <- bb$P;Pf <- bf$P 
        P1b <- bb$P1;P1f <- bf$P1
        trAb <- bb$trA;trAf <- bf$trA
        gtrAb <- bb$trA1;gtrAf <- bf$trA1
        devb <- bb$deviance;devf <- bf$deviance
        D1b <- bb$D1;D1f <- bf$D1
      }
     

      if (reml) {
        scoreb <- bb$REML;scoref <- bf$REML;
        if (deriv==2) { gradb <- bb$REML1;gradf <- bf$REML1}
      } else if (scoreType=="GACV") {
        scoreb <- bb$GACV;scoref <- bf$GACV;
        if (deriv==2) { gradb <- bb$GACV1;gradf <- bf$GACV1}
      } else if (scoreType=="UBRE"){
        scoreb <- bb$UBRE; scoref <- bf$UBRE;
        if (deriv==2) { gradb <- bb$UBRE1;gradf <- bf$UBRE1} 
      } else { ## default to deviance based GCV
        scoreb <- bb$GCV;scoref <- bf$GCV;
        if (deriv==2) { gradb <- bb$GCV1;gradf <- bf$GCV1}
      }

      if (!reml) {
        fd.P1[i] <- (Pf-Pb)/eps
        fd.gtrA[i] <- (trAf-trAb)/eps
        fd.D1[i] <- (devf - devb)/eps
      }
      
     
      fd.grad[i] <- (scoref-scoreb)/eps
      if (deriv==2) { 
        fd.hess[,i] <- (gradf-gradb)/eps
        if (!reml) {
          fd.htrA[,i] <- (gtrAf-gtrAb)/eps
          fd.P2[,i] <- (P1f-P1b)/eps
          fd.D2[,i] <- (D1f-D1b)/eps
        } 
       
      }
   }
   
   if (!reml) {
     cat("\n Pearson Statistic... \n")
     cat("grad    ");print(P10)
     cat("fd.grad ");print(fd.P1)
     if (deriv==2) {
       fd.P2 <- .5*(fd.P2 + t(fd.P2))
       cat("hess\n");print(P2)
       cat("fd.hess\n");print(fd.P2)
     }

     cat("\n\n tr(A)... \n")
     cat("grad    ");print(gtrA0)
     cat("fd.grad ");print(fd.gtrA)
     if (deriv==2) {
       fd.htrA <- .5*(fd.htrA + t(fd.htrA))
       cat("hess\n");print(htrA)
       cat("fd.hess\n");print(fd.htrA)
     }
   

     cat("\n Deviance... \n")
     cat("grad    ");print(D10)
     cat("fd.grad ");print(fd.D1)
     if (deriv==2) {
       fd.D2 <- .5*(fd.D2 + t(fd.D2))
       cat("hess\n");print(D2)
       cat("fd.hess\n");print(fd.D2)
     }
   }
 
   cat("\n\n The objective...\n")

   cat("grad    ");print(grad0)
   cat("fd.grad ");print(fd.grad)
   if (deriv==2) {
     fd.hess <- .5*(fd.hess + t(fd.hess))
     cat("hess\n");print(hess)
     cat("fd.hess\n");print(fd.hess)
   }
   NULL
1108
} ## deriv.check
1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130


rt <- function(x,r1) {
## transform of x, asymptoting to values in r1
## returns derivatives wrt to x as well as transform values
## r1[i] == NA for no transform 
  x <- as.numeric(x)
  ind <- x>0 
  rho2 <- rho1 <- rho <- 0*x
  if (length(r1)==1) r1 <- x*0+r1
  h <- exp(x[ind])/(1+exp(x[ind]))
  h1 <- h*(1-h);h2 <- h1*(1-2*h)
  rho[ind] <- r1[ind]*(h-0.5)*2
  rho1[ind] <- r1[ind]*h1*2
  rho2[ind] <- r1[ind]*h2*2
  rho[!ind] <- r1[!ind]*x[!ind]/2
  rho1[!ind] <- r1[!ind]/2
  ind <- is.na(r1)
  rho[ind] <- x[ind]
  rho1[ind] <- 1
  rho2[ind] <- 0
  list(rho=rho,rho1=rho1,rho2=rho2)
1131
} ## rt
1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144

rti <- function(r,r1) {
## inverse of rti.
  r <- as.numeric(r)
  ind <- r>0
  x <- r
  if (length(r1)==1) r1 <- x*0+r1
  r2 <- r[ind]*.5/r1[ind] + .5
  x[ind] <- log(r2/(1-r2))
  x[!ind] <- 2*r[!ind]/r1[!ind]
  ind <- is.na(r1)
  x[ind] <- r[ind]
  x
1145
} ## rti
1146

1147
simplyFit <- function(lsp,X,y,Eb,UrS,L,lsp0,offset,U1,Mp,family,weights,
1148
                   control,gamma,scale,conv.tol=1e-6,maxNstep=5,maxSstep=2,
1149
                   maxHalf=30,printWarn=FALSE,scoreType="deviance",
1150
                   mustart = NULL,null.coef=rep(0,ncol(X)),Sl=Sl,...)
1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
## function with same argument list as `newton' and `bfgs' which simply fits
## the model given the supplied smoothing parameters...
{ reml <- scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator

  ## sanity check L
  if (is.null(L)) L <- diag(length(lsp)) else {
    if (!inherits(L,"matrix")) stop("L must be a matrix.")
    if (nrow(L)<ncol(L)) stop("L must have at least as many rows as columns.")
    if (nrow(L)!=length(lsp0)||ncol(L)!=length(lsp)) stop("L has inconsistent dimensions.")
  }
  if (is.null(lsp0)) lsp0 <- rep(0,ncol(L))
  ## initial fit

  b<-gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS,
     offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
     control=control,gamma=gamma,scale=scale,
1167
     printWarn=FALSE,mustart=mustart,scoreType=scoreType,null.coef=null.coef,Sl=Sl,...)
1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178

  if (reml) {       
          score <- b$REML
  } else if (scoreType=="GACV") {
          score <- b$GACV
  } else if (scoreType=="UBRE") {
          score <- b$UBRE
  } else score <- b$GCV

  list(score=score,lsp=lsp,lsp.full=L%*%lsp+lsp0,grad=NULL,hess=NULL,score.hist=NULL,iter=0,conv =NULL,object=b)

1179
} ## simplyFit
1180 1181 1182 1183


newton <- function(lsp,X,y,Eb,UrS,L,lsp0,offset,U1,Mp,family,weights,
                   control,gamma,scale,conv.tol=1e-6,maxNstep=5,maxSstep=2,
1184
                   maxHalf=30,printWarn=FALSE,scoreType="deviance",start=NULL,
1185
                   mustart = NULL,null.coef=rep(0,ncol(X)),pearson.extra,
1186
                   dev.extra=0,n.true=-1,Sl=NULL,edge.correct=FALSE,...)
1187 1188 1189 1190 1191 1192 1193 1194
## Newton optimizer for GAM reml/gcv/aic optimization that can cope with an 
## indefinite Hessian. Main enhancements are: 
## i) always perturbs the Hessian to +ve definite if indefinite 
## ii) step halves on step failure, without obtaining derivatives until success; 
## (iii) carries start values forward from one evaluation to next to speed convergence;
## iv) Always tries the steepest descent direction as well as the 
##     Newton direction for indefinite problems (step length on steepest trial could
##     be improved here - currently simply halves until descent achieved).    
1195 1196
## L is the matrix such that L%*%lsp + lsp0 gives the logs of the smoothing 
## parameters actually multiplying the S[[i]]'s
1197 1198
## NOTE: an obvious acceleration would use db/dsp to produce improved
##       starting values at each iteration... 
1199 1200 1201 1202 1203
{  
  reml <- scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator

  ## sanity check L
  if (is.null(L)) L <- diag(length(lsp)) else {
1204 1205
    if (!inherits(L,"matrix")) stop("L must be a matrix.")
    if (nrow(L)<ncol(L)) stop("L must have at least as many rows as columns.")
1206
    if (nrow(L)!=length(lsp0)||ncol(L)!=length(lsp)) stop("L has inconsistent dimensions.")
1207
  }
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235
  if (is.null(lsp0)) lsp0 <- rep(0,nrow(L)) 

  if (reml&&FALSE) { ## NOTE: routine set up to allow upper limits on lsp, but turned off.
    frob.X <- sqrt(sum(X*X))
    lsp.max <- rep(NA,length(lsp0))
    for (i in 1:nrow(L)) { 
      lsp.max[i] <- 16 + log(frob.X/sqrt(sum(UrS[[i]]^2))) - lsp0[i]
      if (lsp.max[i]<2) lsp.max[i] <- 2
    } 
  } else lsp.max <- NULL

  if (!is.null(lsp.max)) { ## then there are upper limits on lsp's
    lsp1.max <- coef(lm(lsp.max-lsp0~L-1)) ## get upper limits on lsp1 scale
    ind <- lsp>lsp1.max
    lsp[ind] <- lsp1.max[ind]-1 ## reset lsp's already over limit
    delta <- rti(lsp,lsp1.max) ## initial optimization parameters
  } else { ## optimization parameters are just lsp
    delta <- lsp
  }

  ## code designed to be turned on during debugging...
  check.derivs <- FALSE;sp.trace <- FALSE
  if (check.derivs) {
     deriv <- 2
     eps <- 1e-4
     deriv.check(x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS,
         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
         control=control,gamma=gamma,scale=scale,
1236
         printWarn=FALSE,start=start,mustart=mustart,
1237
         scoreType=scoreType,eps=eps,null.coef=null.coef,Sl=Sl,...)
1238 1239

     
1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252
  }

#  ii <- 0
#  if (ii>0) {
#    score.transect(ii,x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS,
#         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
#         control=control,gamma=gamma,scale=scale,
#         printWarn=FALSE,mustart=mustart,
#         scoreType=scoreType,eps=eps,null.coef=null.coef,...)
#  }
  ## ... end of debugging code 


1253
  ## initial fit
1254 1255
  b<-gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
     offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=2,
1256
     control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
1257
     mustart=mustart,scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra,
1258
     dev.extra=dev.extra,n.true=n.true,Sl=Sl,...)
1259

1260
  mustart <- b$fitted.values
1261
  etastart <- b$linear.predictors
1262
  start <- b$coefficients
1263

1264 1265 1266
  if (reml) {
     old.score <- score <- b$REML;grad <- b$REML1;hess <- b$REML2 
  } else if (scoreType=="GACV") {
1267 1268 1269 1270 1271 1272 1273
    old.score <- score <- b$GACV;grad <- b$GACV1;hess <- b$GACV2 
  } else if (scoreType=="UBRE"){
    old.score <- score <- b$UBRE;grad <- b$UBRE1;hess <- b$UBRE2 
  } else { ## default to deviance based GCV
    old.score <- score <- b$GCV;grad <- b$GCV1;hess <- b$GCV2
  }
  
1274 1275 1276
  grad <- t(L)%*%grad
  hess <- t(L)%*%hess%*%L

1277 1278 1279 1280 1281 1282 1283
  if (!is.null(lsp.max)) { ## need to transform to delta space
    rho <- rt(delta,lsp1.max)
    nr <- length(rho$rho1)
    hess <- diag(rho$rho1,nr,nr)%*%hess%*%diag(rho$rho1,nr,nr) + diag(rho$rho2*grad)
    grad <- rho$rho1*grad
  }

1284 1285
  if (reml) score.scale <- abs(log(b$scale.est)) + abs(score) else 
  score.scale <- b$scale.est + abs(score)    
1286
  uconv.ind <- abs(grad) > score.scale*conv.tol
1287 1288
  ## check for all converged too soon, and undo !
  if (!sum(uconv.ind)) uconv.ind <- uconv.ind | TRUE
Dirk Eddelbuettel's avatar