Regression.r

Dec 4, 2014


# We'll go through the regression example from the book, and I'll show how to do
# this type of analysis in R. This is the "Janka Hardness vs. Wood Density"
# data, see Table 15.5 and Ch 22

# You can find the data here:
# http://www.uow.edu.au/~mwand/webspr/janka.txt
# (just save the text file in your current working directory for R)

# First, we load the data into a dataframe like so:
janka = read.table("janka.txt", header=TRUE)

# The "header=TRUE" option tells R to use the first line as column names
# Try typing just "janka" into the command line to see the raw data
janka

   dens hardness
1  24.7      484
2  24.8      427
3  27.3      413
4  28.4      517
5  28.4      549
6  29.0      648
7  30.3      587
8  32.7      704
9  35.6      979
10 38.5      914
11 38.8     1070
12 39.3     1020
13 39.4     1210
14 39.9      989
15 40.3     1160
16 40.6     1010
17 40.7     1100
18 40.7     1130
19 42.9     1270
20 45.8     1180
21 46.9     1400
22 48.2     1760
23 51.5     1710
24 51.5     2010
25 53.4     1880
26 56.0     1980
27 56.5     1820
28 57.3     2020
29 57.6     1980
30 59.2     2310
31 59.8     1940
32 66.0     3260
33 67.4     2700
34 68.8     2890
35 69.1     2740
36 69.1     3140


# A scatterplot of the data is simple in R
plot(janka, main="Hardness vs. Density of Timber")

plot of chunk unnamed-chunk-1


# This is the same as calling the following command
# (but notice the axes are labeled different)
plot(janka$hardness ~ janka$dens, main="Hardness vs. Density of Timber")

# The "y ~ x" means y is the dependent variable and x is the independent variable
# The "lm" command stands for "linear model" and will fit a linear regression to
# paired data. It uses the same "y ~ x" syntax.
janka.model = lm(hardness ~ dens, data=janka)

# The output of "lm" is an object with lots of information about the linear
# regression fit. Type the following command to see what variables it contains.
names(janka.model)

 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"


# The first item to look at is the "coefficients" this is the intercept (alpha)
# and slope (beta)
janka.model$coefficients

(Intercept)        dens 
   -1160.50       57.51


# The command "abline" will add a line to the plot with a certain intercept a
# (first parameter) and slope b (second parameter)
abline(janka.model$coefficients[1], janka.model$coefficients[2], col="red")

# The abline command is smart enough to know when you pass it a regression fit.
# The above command can be simplified like so:
abline(janka.model, col="red")

plot of chunk unnamed-chunk-1


# The next thing we can look at is the residuals between the data and the model
hist(janka.model$residuals, main="Residuals of Janka Regression")

plot of chunk unnamed-chunk-1


# Here we plot the residual values vs. the independent variable
plot(janka.model$residuals ~ janka$hardness, main="Residuals vs. Hardness")

plot of chunk unnamed-chunk-1


# Here's another example: the "Sea Slug" data from this URL:
# http://www.stat.ucla.edu/projects/datasets
# Before getting started, edit the seaslug.txt file to remove all entries with Time=99
# (this is a control group that should be removed)
# Try going through the same procedures with this data set
seaslug = read.table("seaslug.txt", header=TRUE)

# I also recommend searching for "sea slug" in Google images :)


## Here is an example for writing a function in R that returns two values
## in a list (like your function "my.regression" should do)
foo = function(x)
{
  y = x^2 + 2.3
  z = sin(x)
  return(list(y = y, z = z))
}

foo(1)

$y
[1] 3.3

$z
[1] 0.8415

foo(2)

$y
[1] 6.3

$z
[1] 0.9093