## Example 1: Reading in data from files
##############################################################################
## Motivating Example: Is there an observer-induced bias in our count data?
## Redo the analyses from last class and include Observer as a covariate.
##############################################################################
rm(list = ls(all=TRUE)) # clear the workspace!
library(readxl) # load the spreadsheet full of transect info
dataxl <- as.data.frame(read_xlsx("transectcounts.xlsx",sheet = 1))
# Explore the new object...
names(dataxl)
class(dataxl)
str(dataxl)
head(dataxl)
# Prep our data frame so it has the right columns and column names
names(dataxl) <- c("TransectID","Length","Prop1","count1","count2","count")
dataxl$Length1 <- dataxl$Length * dataxl$Prop1/100
dataxl$Length2 <- dataxl$Length - dataxl$Length1
head(dataxl)
## What about other file types? (See foreign package for more options!)
## See also http://www.r-tutor.com/r-introduction/data-frame/data-import
# ?read.csv
# ?read.table
# ?read_xlsx
## EXERCISE 1 -- check colum classes!
##
## Often the default storage type or class is incorrect! To change it, either
## 1. Reformat that column...
dataxl$TransectID <- as.character(dataxl$TransectID)
str(dataxl)
##
## 2. Use options like colClasses (read.table, read.csv, etc.), col_types
## In our case, rerun the above code but add the option
## ... col_types = c("text","numeric","numeric","numeric","numeric","numeric")
## EXERCISE 2 -- reshaping data: wide to long format
##
## We would like to add observer IDs to this data to test for observer effects.
## The observer information for each transect is in file 'transectobservers.csv'
##
## Load the data into data frame `tobs'
tobs = read.csv("transectobservers.csv")
str(tobs)
head(tobs)
## Next, we need to manipulate the data so that
## column 1 is the TransectID
## column 2 is the ObserverName
## See the data transformation and data wrangling cheatsheets at
## https://www.rstudio.com/resources/cheatsheets/
## https://www.rstudio.com/resources/cheatsheets/
## See Data processing with dplyr & tidyr at
## https://rpubs.com/bradleyboehmke/data_wrangling
## But also know there are other options, e.g. the reshape2 package
## http://www.milanor.net/blog/reshape-data-r-tidyr-vs-reshape2/
##
## Moving on...
## Use tidyr::gather() to reshape from wide to long format
library(tidyr) # install.packages("tidyr") if needed.
tobs2 = gather(tobs, Observer, TransectID, na.rm = TRUE)
# WARNING: Make sure names match exactly! Observer, not observer, etc.
# Also note I added na.rm=TRUE to remove the 'TransectID=NA' cases
str(tobs2)
head(tobs2)
nrow(tobs2) # number of rows. ncol() and dim() are also handy.
## Remove the duplicate Observer column introduced by read.csv()
## making the first column a factor instead of text column.
tobs2 <- tobs2[,c(1,3)] # subset all rows, columns 1 and 3.
head(tobs2)
## Now we can use tobs2 like a 'look up table' to add
## a column "Observer" to the dataxl dataframe.
head(dataxl,2)
head(tobs2, 2)
## Put them into data frame 'mydata' using merge()
mydata <- merge(dataxl, tobs2, by="TransectID")
str(mydata)
head(mydata,3)
###########################################################################
## Exercise: Is there an observer-induced bias?
## Redo the analyses from last class and include Observer as a covariate.
library(MASS) # load glm.nb into the workspace...
## Plot both dataxl and mydata to make sure merge() didn't mix up data
## by adding to the code below to also plot dataxl.
library(ggplot2) # the following should yield identical figures:
x11() # quartz() on a mac of x11() doesn't open a new figure window
ggplot(mydata, aes(count, fill = Prop1)) + geom_histogram(binwidth=1) + facet_grid(Prop1 ~ .)
## HINT:
## Using count-data-NB-transectcounts-results.R (see course website),
## modify the function calls to glm() and glm.nb() to include
## Observer as a covariate, and rerun that analysis to see of Observer
## improves the model fits or not. For example...
fitpois1 <- glm(count ~ Length - 1, data=mydata, family = poisson(link = "identity"))
fitpois1Obs <- glm(count ~ Length + Observer - 1, data=mydata, family = poisson(link = "identity"))
## To see how to include interaction terms (required for 620 students)
## See ?formula