# the holy trinity of data wrangling packages - thank you Hadley Wickham require(dplyr) require(tidyr) require(lubridate) # # Step #1 - First load in files # These are the Raw Data files. # They follow a constellation schema per Commandment #3 # (raptor_survey.csv/df is the fact table # raptor_temps.csv/temps is a 2nd environmental fact table # raptor_sites.csv/siteinfo is a dimension table (site) for both fact tables # Two of the tables are in row-column form (per Command #2) # But one table is in dimensional form rather than row-column. We will need to fix this # #bring things in NOT as R factors to start - just numbers or strings options(stringsAsFactors = FALSE) #have 3 files to load df=read.csv("c:/temp/raptor_survey.csv",header=TRUE) temps=read.csv("c:/temp/raptor_temps.csv",header=TRUE) siteinfo=read.csv("c:/temp/raptor_sites.csv",header=TRUE) # # Step #2 - explore what we have - determine dimensions, joining fields # check for errors # This is part of commandment #4 # str(df) #hmm Alpha column/site has "1.o" instead of 1.0 (is chr not int) head(df) summary(df) unique(df$spec) #hmm Accipiter cooperii misspelled a bit head(temps) summary(temps) #oops - 100 C is a bit hot unique(temps$site) head(siteinfo) #column is "sitename" instead of "site" unique(siteinfo$site) summary(siteinfo) # # Step #3 - clean up stuff (e.g. column names, things in single columns) # Per commandment #4 we are doing cleaning up # Per command #5 we are doing this in scripts and not hand editing the raw files # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # it is MUCH more repeatable to fix here in script # and leave raw files untouched # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #3a) site column called "sitename" in siteinfo inconsistent with "site" in core table colnames(siteinfo)[[1]]="site" str(siteinfo) #names should match #3b) fix string abunds for Alpha site as.numeric(df$Alpha) #can't just coerce - gives warning df$Alpha #lots of NAs are OK, #'s look OK unique(df$Alpha[!is.na(df$Alpha)]) #zoom in on non-NAs #ah ha somebody typed 1.o instead of 1.0 df$Alpha[df$Alpha=="1.o"]=1 #note other site columns are integer so convert to integer df$Alpha=as.integer(df$Alpha) #no errors - nailed it str(df) #OK all site columns numeric # # Step #4 - Make tall & filter # Per commandment #2 we take our one table that is dimensional and make it row-column # # dplyr::gather([data],name_new_factor_column,name_new_value_column,cols to use or exclude) dftall=df %>% gather(site,abund,-date,-spec) dftall1=dftall %>% filter(abund>0) #drop all the NA rows # # Step #5 - more cleanup - again do in code, not raw files # Commandment #4/5 again - I told you it was a large part of the work! # #5a fix cooperii misspelling fix=which(dftall1$spec=="Accipiter cooperi") cat("# of cooperi misspellings=",length(fix)) dftall1[fix,"spec"]="Accipiter cooperii" unique(dftall1$spec) #5b make a genus column #separate(column,newcolnames,separater(default to non-alphanumeric)) dftall2= dftall1 %>%separate(spec,c("genus","specname"),remove=FALSE) %>% select(-specname) df=dftall2 #save some typing #Commandment #6 # We have a fact table (df) with 3 dimensions (date, site, species). # we have another fact table (temps) with 2 dimensions (date, site) # These share date and site dimensions # The site dimension has its own dimension table (siteinfo) with additional site attributes # # Step #6 - join in ancillary tables to make one single # "denormalized" data frame # per comamndment #8, this is being done in reproducible code # #recall 2 tables to join: # siteinfo only on site # temps by site & date #we are now working with factors, so clean up siteinfo #do the first join to siteinfo #beware - by= is one place you have to use quotes on column names df1=full_join(df,siteinfo,by="site") head(df1) str(df1) #OK now join in temps on 2 fields (site,date) df2=left_join(df1,temps,by=c("site","date")) head(df2,n=20) #rows 11 & 12 one site, one date, same temp # # Step #7 - now is right shape, make columns right type (should be last step) # Per the spirit of Commandment #7 make this data as easy to use as possible # per comamndment #8, this is being done in reproducible code # df2$date=ymd(df2$date) df2$spec=as.factor(df2$spec) df2$genus=as.factor(df2$genus) df2$site=as.factor(df2$site) df2$region=as.factor(df2$region) df2$lon=as.numeric(df2$lon) thedata=df2 #pull up to a name I'll remember for analysis head(thedata) str(thedata) # # Step #8 - add calculated columns, e.g DOY, Month of year # thedata=thedata %>% mutate(doy=yday(date),moy=month(date)) # # Step #9 - Analyses # Per commandment #9 our analyses are being done in reusable code # Per commandment #8 we are conceptualizing this dimensionally, # and we are making this table completely denormalized # and as easy to work with (adding calculated fields like DOY) # but also per commandment #8 we are leaving this in the native # form for our analysis tool, R and dplyr, namely row-column data # # a) filter to april (full month surveyed) & summarize by doy thedata %>% filter(moy==4) %>% group_by(doy) %>% summarise(abund=mean(abund,na.rm=TRUE)) # b) same but doy and region thedata %>% filter(moy==4) %>% group_by(doy,region) %>% summarise(abund=mean(abund,na.rm=TRUE)) %>% spread(region,abund) # c) top 5 counts thedata %>% top_n(5,abund) # d) summarize to views per month, normalized by days observed thedata %>% group_by(moy) %>% summarise(abund=sum(abund,na.rm=TRUE),obsdays=n()) %>% mutate(abund.per.day=abund/obsdays) %>% select(moy,abund.per.day) # e) summarise to month x species table (normalized by days observed) thedata %>% group_by(moy,spec) %>% summarise(abund=sum(abund,na.rm=TRUE),obsdays=n()) %>% mutate(abund.per.day=abund/obsdays) %>% select(moy,spec,abund.per.day) %>% spread(moy,abund.per.day) # f) summarize to views per month each year, normalized by days observed thedata %>% group_by(moyr=floor_date(date,"month")) %>% summarise(abund=sum(abund,na.rm=TRUE),obsdays=n()) %>% mutate(abund.per.day=abund/obsdays) %>% select(moyr,abund.per.day)