set.seed(123)  # ensures replication

# predictors and response
N = 100  # sample size
k = 2    # number of desired predictors
X = matrix(rnorm(N * k), ncol = k)
y = -.5 + .2*X[, 1] + .1*X[, 2] + rnorm(N, sd = .5)  # increasing N will get estimated values closer to these

dfXy = data.frame(X, y)

#'
#' # Functions
#'
#' A maximum likelihood approach.

lm_ML = function(par, X, y) {
  # par: parameters to be estimated
  # X: predictor matrix with intercept column
  # y: response
  
  # setup
  beta   = par[-1]  # coefficients
  sigma2 = par[1]   # error variance
  sigma  = sqrt(sigma2)
  N = nrow(X)
  
  # linear predictor
  LP = X %*% beta  # linear predictor
  mu = LP          # identity link in the glm sense
  
  # calculate likelihood
  L = dnorm(y, mean = mu, sd = sigma, log = TRUE)  # log likelihood
  # L = -.5*N*log(sigma2) - .5*(1/sigma2)*crossprod(y - mu)  # alternate log likelihood form
  
  -sum(L)  # optim by default is minimization, and we want to maximize the likelihood
           # (see also fnscale in optim.control)
}

# An approach via least squares loss function.
lm_LS = function(par, X, y) {
  # arguments-
  # par: parameters to be estimated
  # X: predictor matrix with intercept column
  # y: response
  
  # setup
  beta = par  # coefficients
  
  # linear predictor
  LP = X %*% beta  # linear predictor
  mu = LP          # identity link
  
  # calculate least squares loss function
  L = crossprod(y - mu)
}

#' # Obtain Model Estimates
#'
#' Setup for use with optim.

X = cbind(1, X)

#' Initial values. Note we'd normally want to handle the sigma differently as
#' it's bounded by zero, but we'll ignore for demonstration. Also sigma2 is not
#' required for the LS approach as it is the objective function.

init = c(1, rep(0, ncol(X)))
names(init) = c('sigma2', 'intercept', 'b1', 'b2')

optlmML = optim(
  par = init,
  fn  = lm_ML,
  X   = X,
  y   = y,
  control = list(reltol = 1e-8)
)

optlmLS = optim(
  par = init[-1],
  fn  = lm_LS,
  X   = X,
  y   = y,
  control = list(reltol = 1e-8)
)

pars_ML = optlmML$par
pars_LS = c(sigma2 = optlmLS$value / (N - k - 1), optlmLS$par)  # calculate sigma2 and add

#' # Comparison
#'
#' Compare to `lm` which uses QR decomposition.

modlm = lm(y ~ ., dfXy)

#' Example
#'
# QRX = qr(X)
# Q = qr.Q(QRX)
# R = qr.R(QRX)
# Bhat = solve(R) %*% crossprod(Q, y)
# alternate: qr.coef(QRX, y)

round(
  rbind(
    pars_ML,
    pars_LS,
    modlm = c(summary(modlm)$sigma^2, coef(modlm))
  ),
  digits = 3
)
#' The slight difference in sigma is roughly maxlike dividing by N vs. N-k-1 in
#' the traditional least squares approach; diminishes with increasing N as both
#' tend toward whatever sd^2 you specify when creating the y response above.
#'
#' Compare to glm, which by default assumes gaussian family with identity link
#' and uses `lm.fit`.
#'

modglm = glm(y ~ ., data = dfXy)

summary(modglm)
Call:
glm(formula = y ~ ., data = dfXy)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.93651  -0.33037  -0.06222   0.31068   1.03991  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.43247    0.04807  -8.997 1.97e-14 ***
X1           0.13341    0.05243   2.544   0.0125 *  
X2           0.11191    0.04950   2.261   0.0260 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.2262419)

    Null deviance: 24.444  on 99  degrees of freedom
Residual deviance: 21.945  on 97  degrees of freedom
AIC: 140.13

Number of Fisher Scoring iterations: 2
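#' As a quick added check on the point above, the same point estimates can be
#' reproduced directly with lm.fit, base R's least squares workhorse, which takes
#' the model matrix with the intercept column already included (the X created
#' earlier); this snippet is an illustration, not part of the original comparison.
#'

lm.fit(X, y)$coefficients  # same point estimates as lm/glm above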
#' Via normal equations.

coefs = solve(t(X) %*% X) %*% t(X) %*% y  # coefficients

#' Compare.
#'

sqrt(crossprod(y - X %*% coefs) / (N - k - 1))
[,1]
[1,] 0.4756489
summary(modlm)$sigma
[1] 0.4756489
sqrt(modglm$deviance / modglm$df.residual)
[1] 0.4756489
c(sqrt(pars_ML[1]), sqrt(pars_LS[1]))
   sigma2    sigma2 
0.4684616 0.4756490 
# rerun by adding 3-4 zeros to the N
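#' The two estimates of sigma differ only in the denominator used for the
#' residual variance, so rescaling the LS estimate by (N - k - 1)/N recovers the
#' ML estimate (approximately, up to optim's convergence tolerance). A quick
#' check of that relationship, added for illustration:
#'

sqrt(pars_LS['sigma2'] * (N - k - 1) / N)  # ~ sqrt(pars_ML['sigma2'])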
#' The code below uses gradient descent to find the beta0 and beta1 coefficients
#' that minimize the mean squared error (MSE). The learning rate alpha can be
#' adjusted to influence the speed of learning.
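#' A minimal sketch along those lines, assuming a single predictor (dfXy$X1 from
#' the data above); the function name gd_lm and the defaults alpha = .1 and
#' n_iter = 5000 are illustrative choices rather than anything fixed.

gd_lm = function(x, y, alpha = .1, n_iter = 5000) {
  beta0 = 0  # intercept, initialized at zero
  beta1 = 0  # slope, initialized at zero
  
  for (i in 1:n_iter) {
    resid = y - (beta0 + beta1 * x)  # residuals at the current estimates
    
    # gradients of MSE = mean(resid^2) with respect to beta0 and beta1
    grad0 = -2 * mean(resid)
    grad1 = -2 * mean(resid * x)
    
    # step against the gradient, scaled by the learning rate
    beta0 = beta0 - alpha * grad0
    beta1 = beta1 - alpha * grad1
  }
  
  c(intercept = beta0, b1 = beta1)
}

gd_lm(dfXy$X1, dfXy$y)
# compare: coef(lm(y ~ X1, data = dfXy))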