###############################
#                             #
#        LECTURE EIGHT        #
#                             #
###############################


############################### PART 1 #############################


## The lack-of-fit investigation
# We use the data from Problem 3.21, p.133.
# Outline of this section: test whether a straight line in orif is
# adequate by comparing it with the model that has one mean per
# distinct orif value, then improve the regression and draw it.
radon = read.table("pr0321.txt",header=TRUE)
radon
class(radon$orif)
# orif is numeric here, we also need it as a factor.
# The level labels a, b, c, ... are generated from the number of
# distinct orif values found in the data, not from a fixed count.
A = factor(radon$orif)
levels(A) = letters[seq_len(nlevels(A))]
radon = cbind(radon, A)
# One way of performing the lack of fit test:
# in the sequential ANOVA table the row for A measures what the factor
# explains in addition to the linear term, i.e. the lack of fit.
obj = lm(y ~ orif+A, data=radon) # order of orif and A important!
anova(obj)
# Another way: compare the two nested models directly
obj0 = lm(y ~ orif, data=radon)
obj1 = lm(y ~ A, data=radon)
anova(obj0,obj1)
# One-star significance for lack of fit of linear relation
# We try to improve the regression model - first a plot
plot(y ~ orif, data=radon)
# Some of the y-values come more than once. It has been suggested to
# make this visible by inserting a jitter. Take it or leave it!
# One random horizontal offset per observation (the number of
# observations is taken from the data frame).
jit = runif(nrow(radon),-0.01,0.01)
plot(y ~ I(orif+jit), data=radon,
  xlab="Orifice Diameter", ylab="Radon Released (%)")
# Notice the notation, `I' means `literally'.
# There seems to be a bit of curvature. Will a parabola fit?
obj2 = lm(y ~ orif + I(orif^2) + A, data=radon)
anova(obj2)
# No lack of fit.  The parabola fits well
# No reason for trying more complex models, but let's do it anyway
anova(lm(y ~ orif + I(orif^2) + I(orif^3) + A, data=radon))
# A cubic term does not give better fit.
# Can we draw the parabola?
obj2
# The A coefficients are a nuisance. They disturb the picture.
obj3 = lm(y ~ orif + I(orif^2), data=radon); obj3
# a holds intercept, linear and quadratic coefficient, in that order
a = coef(obj3)
# In "curve", the independent variable must always be called `x'
# add=TRUE draws the parabola on top of the jittered scatter plot
curve(a[1] + a[2]*x + a[3]*x^2, col="red", add=TRUE)
# Now I'll add confidence bands for mean response
orif.tick = seq(0.37, 2, 0.01)
new = data.frame(orif=orif.tick)
# Columns of M: fitted value, lower limit, upper limit
M = predict(obj3, new, interval="confidence")
lines(orif.tick, M[,2], col="blue", lty=2)
lines(orif.tick, M[,3], col="blue", lty=2)
# Cleaning up
rm(list=ls())


########################## EXERCISE #########################


# Exercise 1
# Use the data from Problem 10.12 (p.476).
# (a) Create the model suggested in Problem 10.12 (remember
# to use the I() function)
# (b) Consider the p-values in all of the t-tests - are all terms
# necessary?
# (c) Backwards elimination is one way of simplifying a regression
# model. You start with the most complex model you can think of and
# try to remove terms one by one. In one version of backwards
# elimination you remove the term with the largest p-value, provided
# this p-value is larger than alpha. Then you redo the model, and do
# the same. This continues until there are no insignificant terms
# left. Try to do this.


############################ PART 2 #########################


## Two-way ANOVA (fixed factors)
# We use the data Table 5.1, p.188 & Example 5.1, p.192.
# Here battery life times are modelled as a function of temperature
# and material.
battery = read.table("battery.txt")
battery  # Notice the data-frame layout
# short notation
y = battery$y; mat = battery$mat; temp = battery$temp
# checking data types
class(y);class(mat);class(temp)
# Both explanatory variables must be factors for a two-way ANOVA
mat = as.factor(mat); temp = as.factor(temp)
# We use the aov function
obj = aov(y ~ mat*temp)
# Note: mat and temp are factors, mat:temp is the interaction
# and mat*temp is short notation for mat + temp + mat:temp
anova(obj)
# Everything is significant.
# We check the residuals
res = resid(obj); fit = fitted(obj)
plot(res ~ as.numeric(mat)); abline(h=0, lty=2)
plot(res ~ as.numeric(temp)); abline(h=0, lty=2)
plot(res ~ fit); abline(h=0, lty=2)
plot(res); abline(h=0, lty=2)
qqnorm(res); qqline(res)
# Interaction plot.
interaction.plot(temp, mat, y)  # Compare with Figure 5.9, p.194
interaction.plot(mat, temp, y)
# On p.194, Montgomery makes pairwise comparisons of materials
# according to Tukey, but only for the room temperature  (70°F).
# Can all temperatures be used for estimating the variance?
# Bartlett's test of equal variances over the material-by-temperature
# cells. The cells are formed from the two factors themselves, so the
# test does not depend on the order of the rows in the data file.
cell = interaction(mat, temp)
bartlett.test(y, cell)  # Yes!
# Exclude the values for cold and hot temperatures.
m2 = tapply(y[temp==70], mat[temp==70], mean)
# Table of absolute differences between the material means at 70°F
abs(outer(m2,m2,"-"))
# Residual mean square, picked from the ANOVA table by name
MSE = anova(obj)["Residuals", "Mean Sq"]
# Number of observations per cell (the design is balanced)
n.rep = length(y) / nlevels(cell)
# Tukey's critical difference: number of means compared and error
# degrees of freedom are taken from the fitted model
qtukey(0.95, nlevels(mat), df.residual(obj)) * sqrt(MSE/n.rep)
# Difference between type 2 and type 3 is not statistically assured
# If an additive model (no interaction) were accepted, materials could
# be compared regardless of temperature, using the command TukeyHSD,
# but this is not justified here.
rm(list=ls())

## Two-way ANOVA (unbalanced case)
# The dataset in Table 15.6, p.653 is a subset of the battery
# dataset above.
battery = read.table("battery.txt")
# The workspace was cleared, so mat and temp have to be made factors
# again - this time inside the data frame, because the models below
# use data=. If they are read as numbers, aov would otherwise fit them
# as regression variables (1 df each) instead of as factors.
battery$mat = as.factor(battery$mat)
battery$temp = as.factor(battery$temp)
# I have found the index numbers of those kept:
included = c(1:4, 7:9, 12:17, 20, 23:24, 26, 28, 32:33)
batt = battery[included, ]
# It is easy to get Table 15.7.
anova(aov(y ~ mat*temp, data=batt))
# The reduced dataset is proportional, which implies that the order
# of the factors still is not important.
anova(aov(y ~ temp*mat, data=batt))
# If we further remove the first item, say, the design is now not only
# unbalanced but also not proportional.
bat = batt[-1, ]
# Same model with the two factors entered in either order
anova(aov(y ~ mat*temp, data=bat))
anova(aov(y ~ temp*mat, data=bat))
# We see that the Sums of Squares for the main effects are changed
rm(list=ls())


######################### EXERCISE ##########################


# Exercise 2
# Problem 5.6 (p.225).  
# Remember: Question (b) before question (a)!


############################# PART 3 ############################


## Analysis of Covariance
# As Subsection 15.3.1 shows, the hand calculations in an ANCOVA are
# pretty cumbersome. With R, it is quite simple!
# We illustrate with the data in Table 15.10, p.656
ancova = read.table("ancova.txt",header=TRUE)
ancova
# y is breaking strength of monofilament fiber, x is fiber thickness,
# machine is the machine that produced the fiber.
class(ancova$machine)
ancova$machine = as.factor(ancova$machine)
plot(y ~ x,data=ancova)
# Same plot with one colour per machine. The colour is looked up from
# the machine factor of each row, so it does not depend on the rows
# being sorted by machine.
machine.col = c("red", "green4", "blue")
plot(y ~ x, col=machine.col[as.integer(ancova$machine)], pch=16,data=ancova)
legend(15, 49, pch=16, col=machine.col,
  legend=paste("Machine", levels(ancova$machine)))
# Looking at this plot, the factor machine seems to have no effect!
anova(lm(y ~ x * machine,data=ancova))
# The term x:machine is sort of interaction, do the three regression
# lines have different slope? No, we can accept parallel regression lines.
# This analysis was too difficult with Montgomery's formulae!
anova(lm(y ~ x + machine,data=ancova))
# We cannot detect any difference between machines, with the same
# p-value as in Table 15.13, p.661
# The final model is just the regression model
anova(lm(y ~ x,data=ancova))
# Finally the faulty analysis in Table 15.14, p.664
# (one-way ANOVA on machine, ignoring the covariate x)
anova(aov(y ~ machine,data=ancova))
# As the plot indicates, the reason for the spurious significance is
# that x (= diameter) is not the same for the machines. If you didn't
# think about diameter you would get a wrong conclusion. Still, if you
# had randomised, even if you had neglected diameter you would rarely
# get such a wrong result.

rm(list=ls())


########################### EXERCISE ##############################


# Exercise 3
# (a) Skim all the datasets in Section 5.7 (pp.225-232).
# Which of the factors can also be considered quantitative (numeric)?
# (b) Consider the data in Problem 5.9 - model this with glass type as
# a factor and temperature as a continuous variable (remember: it is
# always a good idea to plot the data before you analyse it, and decide
# which model to use).
