Linear Regression

Categories: R, code, Linear Regression, OLS
Author: Simon-Pierre Boucher

Published: February 11, 2023

R code to estimate a linear model, first by maximum likelihood and then by a least squares loss function, using `optim`.

set.seed(123)  # ensures replication

# predictors and response
N = 100 # sample size
k = 2   # number of desired predictors
X = matrix(rnorm(N * k), ncol = k)  
y = -.5 + .2*X[, 1] + .1*X[, 2] + rnorm(N, sd = .5)  # increasing N will get estimated values closer to these

dfXy = data.frame(X, y)



#'
#' # Functions 
#'

#' A maximum likelihood approach.
lm_ML = function(par, X, y) {
  # par: parameters to be estimated
  # X: predictor matrix with intercept column
  # y: response
  
  # setup
  beta   = par[-1]                             # coefficients
  sigma2 = par[1]                              # error variance
  sigma  = sqrt(sigma2)
  N = nrow(X)
  
  # linear predictor
  LP = X %*% beta                              # linear predictor
  mu = LP                                      # identity link in the glm sense
  
  # calculate likelihood
  L = dnorm(y, mean = mu, sd = sigma, log = TRUE) # log likelihood
#   L =  -.5*N*log(sigma2) - .5*(1/sigma2)*crossprod(y-mu)    # alternate log likelihood form (up to an additive constant)

  -sum(L)                                      # optim by default is minimization, and we want to maximize the likelihood 
                                               # (see also fnscale in optim.control)
}
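
#' As the fnscale note above suggests, `optim` can also maximize directly if the
#' function returns the log likelihood itself and the control list sets
#' fnscale = -1. A minimal sketch of that variant (the name lm_ML_max is
#' hypothetical and not used below):

lm_ML_max = function(par, X, y) {
  beta  = par[-1]
  sigma = sqrt(par[1])
  sum(dnorm(y, mean = X %*% beta, sd = sigma, log = TRUE))  # log likelihood, to be maximized
}

# e.g. optim(par = init, fn = lm_ML_max, X = X, y = y, control = list(fnscale = -1))
# (using the init values defined below)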

#' An approach via the least squares loss function.

lm_LS = function(par, X, y) {
  # arguments:
  # par: parameters to be estimated
  # X: predictor matrix with intercept column
  # y: response
  
  # setup
  beta = par                                   # coefficients
  
  # linear predictor
  LP = X %*% beta                              # linear predictor
  mu = LP                                      # identity link
  
  # calculate least squares loss function (sum of squared errors)
  crossprod(y - mu)                            # returned value; optim minimizes this directly
}


#' #  Obtain Model Estimates
#'
#' Setup for use with optim.

X = cbind(1, X)

#' Initial values. Note that we would normally want to handle sigma2 differently,
#' since it is bounded below by zero, but we ignore that here for demonstration
#' (a sketch of one such reparameterization follows the initial values below).
#' Also, sigma2 is not needed as a parameter for the LS approach, since its
#' objective is just the sum of squared errors; sigma2 is computed from that
#' value afterward.

init = c(1, rep(0, ncol(X)))
names(init) = c('sigma2', 'intercept', 'b1', 'b2')
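
#' As noted above, one simple way to respect the zero bound on sigma2 is to
#' optimize log(sigma2) and exponentiate inside the function. A minimal sketch
#' of that reparameterization (lm_ML_logvar is a hypothetical name, not used
#' below):

lm_ML_logvar = function(par, X, y) {
  beta  = par[-1]
  sigma = sqrt(exp(par[1]))                    # par[1] is log(sigma2), so it is unconstrained
  -sum(dnorm(y, mean = X %*% beta, sd = sigma, log = TRUE))
}

# e.g. optim(par = c(log_sigma2 = 0, init[-1]), fn = lm_ML_logvar, X = X, y = y)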

optlmML = optim(
  par = init,
  fn  = lm_ML,
  X   = X,
  y   = y,
  control = list(reltol = 1e-8)
)

optlmLS = optim(
  par = init[-1],
  fn  = lm_LS,
  X   = X,
  y   = y,
  control = list(reltol = 1e-8)
)

pars_ML = optlmML$par
pars_LS = c(sigma2 = optlmLS$value / (N - k - 1), optlmLS$par)  # calculate sigma2 and add
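
#' A quick sanity check that both runs converged (optim reports a convergence
#' code of 0 on success):
c(ML = optlmML$convergence, LS = optlmLS$convergence)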



#' #  Comparison
#' 
#' Compare to `lm` which uses QR decomposition.

modlm = lm(y ~ ., dfXy)

#' Example of the QR computation by hand.
#' 
QRX  = qr(X)
Q    = qr.Q(QRX)
R    = qr.R(QRX)
Bhat = solve(R) %*% crossprod(Q, y)
# alternate: qr.coef(QRX, y)
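
#' Quick check that the hand-computed QR coefficients match those from `lm`:
cbind(qr = drop(Bhat), lm = coef(modlm))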

round(
  rbind(
    pars_ML,
    pars_LS,
    modlm = c(summary(modlm)$sigma^2, coef(modlm))), 
  digits = 3
)
        sigma2 intercept    b1    b2
pars_ML  0.219    -0.432 0.133 0.112
pars_LS  0.226    -0.432 0.133 0.112
modlm    0.226    -0.432 0.133 0.112
#' The slight difference in sigma2 comes from maximum likelihood dividing the
#' sum of squared errors by N, versus N - k - 1 in the traditional least squares
#' approach; the difference diminishes with increasing N, as both estimates tend
#' toward the sd^2 specified when creating the y response above.
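
#' A quick check of that relationship, using the least squares objective value
#' (the sum of squared errors) from the optim fit:
c(divide_by_N = optlmLS$value / N, divide_by_N_k_1 = optlmLS$value / (N - k - 1))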


#'
#' Compare to glm, which by default assumes a Gaussian family with identity link
#' and uses `lm.fit`.
#'
modglm = glm(y ~ ., data = dfXy)
summary(modglm)

Call:
glm(formula = y ~ ., data = dfXy)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.93651  -0.33037  -0.06222   0.31068   1.03991  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.43247    0.04807  -8.997 1.97e-14 ***
X1           0.13341    0.05243   2.544   0.0125 *  
X2           0.11191    0.04950   2.261   0.0260 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.2262419)

    Null deviance: 24.444  on 99  degrees of freedom
Residual deviance: 21.945  on 97  degrees of freedom
AIC: 140.13

Number of Fisher Scoring iterations: 2
#' Via normal equations.
coefs = solve(t(X) %*% X) %*% t(X) %*% y  # coefficients
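
#' The same computation written more compactly with crossprod, which avoids
#' explicitly forming the inverse; the result should be identical up to
#' numerical precision:
coefs2 = solve(crossprod(X), crossprod(X, y))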

#' Compare.
#' 
sqrt(crossprod(y - X %*% coefs) / (N - k - 1))
          [,1]
[1,] 0.4756489
summary(modlm)$sigma
[1] 0.4756489
sqrt(modglm$deviance / modglm$df.residual) 
[1] 0.4756489
c(sqrt(pars_ML[1]), sqrt(pars_LS[1]))
   sigma2    sigma2 
0.4684616 0.4756490 
# rerun by adding 3-4 zeros to the N

The code above relies on `optim`'s general-purpose optimizers rather than hand-coded gradient descent. The same fit can also be reached by gradient descent: iteratively update the coefficients (intercept, b1, b2) to minimize the mean squared error (MSE), with a learning rate alpha that can be adjusted to influence the speed of learning.
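
A minimal sketch of that idea, assuming the same X (with the intercept column) and y created above; the function name lm_GD, the learning rate, and the iteration count are arbitrary illustrative choices:

lm_GD = function(X, y, alpha = .1, n_iter = 5000) {
  beta = rep(0, ncol(X))                       # start all coefficients at zero
  N    = nrow(X)
  for (i in 1:n_iter) {
    mu   = X %*% beta                          # current fitted values
    grad = -2 / N * crossprod(X, y - mu)       # gradient of the MSE with respect to beta
    beta = beta - alpha * drop(grad)           # step in the direction that reduces the MSE
  }
  beta
}

round(lm_GD(X, y), 3)  # should be close to the optim, lm, and glm estimates above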