Exercise 1: Plot points and linear regression

  1. Plot the total flow by water year
  2. Add a line with the linear regression


Oftentimes, time series data (such as water quantity or water quality) do not meet the assumptions of linear regressions. We won’t go into those here, but for a list, see http://r-statistics.co/Assumptions-of-Linear-Regression.html. It is common for streamflow trends to be assessed using non-parametric methods, such as the Mann-Kendall (MK) test. The MK test is appropriate for identifying where changes show a monotonic increase or decrease over time, and whether that change is significant. The MK test is often paired with Theil-Sen line to estimate the slope of the trend. This method is robust against outliers and essentially calculates the median of the slope between all pairs of sample points within your data. We calculate the MK test to get the significance of the trend and the Theil-Sen test to get the slope, intercept and confidence interval for plotting the data.

############################################  
# Mann Kendall test
############################################
temp.ts<-ts(annual.flow$Total_cms, start=min(annual.flow$WaterYear, freq=1));  
  mk <- mk.test(temp.ts); mk
## Mann-Kendall Test
##  
## two-sided homogeinity test
## H0: S = 0 (no trend)
## HA: S != 0 (monotonic trend)
##  
## Statistics for total series
##      S     varS    Z    tau  pvalue
## 1 -401 74404.33 -1.5 -0.107 0.14253
  sen <- sens.slope(temp.ts); sen
##  
## Sen's slope and intercept
##  
##  
## slope:  -27.8002
## 95 percent confidence intervall for slope
## 10.0201 -65.247
##  
## intercept: 12216.0641
## nr. of observations: 87
  #sen$b.sen*sen$nobs;
  #sen$b.sen.up*sen$nobs;  sen$b.sen.lo*sen$nobs
  #sen$intercept  
  
# create lines to plot on graph
  #y-axis. We are using the equation y=a+b*x, where a is the intercept and b is the slope
  estimate<-round((sen$intercept+sen$b.sen*seq(0,dim(annual.flow)[1]-1,1)),1)
  #upper confidence interval
  upperc1<-round((sen$intercept+sen$b.sen.up*seq(0,dim(annual.flow)[1]-1,1)),1)
  #lower confidence interval
  lowerc1<-round((sen$intercept+sen$b.sen.lo*seq(0,dim(annual.flow)[1]-1,1)),1)

#Plot Annual Flow over time
par(mar = c(5,5,3,5)) #set plot margins
plot(annual.flow$WaterYear, annual.flow$Total_cms, type='n', yaxt="n",
     ylab="Total Streamflow (cms)", xlab = '', cex.lab=0.9)
axis(2, las=2, cex.axis=0.8)
box()
#add data to the plot
  points(annual.flow$WaterYear, annual.flow$Total_cms, col=rgb(0,0,0,0.8), cex=0.8, pch=19)  
  lines(annual.flow$WaterYear, annual.flow$Total_cms, col=rgb(0,0,0,0.8))  

#Add linear regression to plot
  lines(x.est$WaterYear, y.est$fit, col="red", lty=2);
  
#draw a polygon between the lower and upper confidence interval  
  polygon(c(rev(annual.flow$WaterYear), (annual.flow$WaterYear)), c(rev(lowerc1), (upperc1)),  col=rgb(0,0,0.2,0.2), border=NA)
  lines(annual.flow$WaterYear, estimate, lty=2, col=rgb(0,0,.2,1))
  legend("bottomleft",c("Annual Flow", "Linear Regression", "MK regression", "MK confidence interval"), col=c("black","red",rgb(0,0,0.2),rgb(0,0,0.2,0.2)),
         pch=c(19,NA,NA,22), pt.cex=c(0.8,1,1,2), lty=c(1,2,2,NA), pt.bg=c(NA,NA,NA,rgb(0,0,0.2,0.2)), cex=0.8)


Does the trend change before or after Falls Lake was built?

Here, we will repeat the MK and Sen tests for subsets of the data (before Falls Lake, after Falls Lake) and plot those data on the same graph.

#create the plot
par(mar = c(5,5,3,5)) #set plot margins
  plot(annual.flow$WaterYear, annual.flow$Total_cms, type='n', yaxt="n", ylim=c(0,max(annual.flow$Total_cms, na.rm=TRUE)),
       ylab="Total Streamflow (cms)", xlab = '', cex.lab=0.9)
  axis(2, las=2, cex.axis=0.8)
  
  #add the data
  points(annual.flow$WaterYear, annual.flow$Total_cms, col=rgb(0,0,0,0.8), cex=0.8, pch=19)  
  lines(annual.flow$WaterYear, annual.flow$Total_cms, col=rgb(0,0,0,0.8))  
  #draw a rectangle around the period Falls Lake was built (x1,y1,x2,y2)
  rect(1980,0,1984,100000, col=rgb(0.7,0,0,0.2), border=NA)

#Subset and run the MK test
######################################
# Pre Falls Lake
#######################################
pre.falls <- subset(annual.flow, WaterYear<=1980)
  #create time series
  temp.ts<-ts(pre.falls$Total_cms, start=min(pre.falls$WaterYear, freq=1));  
  mk <- mk.test(temp.ts); mk
## Mann-Kendall Test
##  
## two-sided homogeinity test
## H0: S = 0 (no trend)
## HA: S != 0 (monotonic trend)
##  
## Statistics for total series
##     S     varS    Z    tau  pvalue
## 1 -77 14291.67 -0.6 -0.063 0.52495
  sen <- sens.slope(temp.ts); sen
##  
## Sen's slope and intercept
##  
##  
## slope:  -25.9626
## 95 percent confidence intervall for slope
## 61.8574 -107.4761
##  
## intercept: 12148.8019
## nr. of observations: 50
  # create lines to plot on graph
  estimate<-round((sen$intercept+sen$b.sen*seq(0,dim(pre.falls)[1]-1,1)),1)
  upperc1<-round((sen$intercept+sen$b.sen.up*seq(0,dim(pre.falls)[1]-1,1)),1)
  lowerc1<-round((sen$intercept+sen$b.sen.lo*seq(0,dim(pre.falls)[1]-1,1)),1)
  
  #plot the polygon and line
  polygon(c(rev(pre.falls$WaterYear), (pre.falls$WaterYear)), c(rev(lowerc1), (upperc1)),  col=rgb(0,0,0.2,0.2), border=NA)
    lines(pre.falls$WaterYear, estimate, lty=2, col=rgb(0,0,.2,1))

######################################
# Post Falls Lake
#######################################
post.falls <- subset(annual.flow, WaterYear>=1984)
    #create time series
    temp.ts<-ts(post.falls$Total_cms, start=min(post.falls$WaterYear, freq=1));  
    mk <- mk.test(temp.ts); mk
## Mann-Kendall Test
##  
## two-sided homogeinity test
## H0: S = 0 (no trend)
## HA: S != 0 (monotonic trend)
##  
## Statistics for total series
##   S     varS Z   tau  pvalue
## 1 3 4550.333 0 0.005 0.97635
    sen <- sens.slope(temp.ts); sen
##  
## Sen's slope and intercept
##  
##  
## slope:  1.9004
## 95 percent confidence intervall for slope
## 171.663 -178.9858
##  
## intercept: 10195.6163
## nr. of observations: 34
    # create lines to plot on graph
    estimate<-round((sen$intercept+sen$b.sen*seq(0,dim(post.falls)[1]-1,1)),1)
    upperc1<-round((sen$intercept+sen$b.sen.up*seq(0,dim(post.falls)[1]-1,1)),1)
    lowerc1<-round((sen$intercept+sen$b.sen.lo*seq(0,dim(post.falls)[1]-1,1)),1)
    
    #plot the polygon and line on the graph
    polygon(c(rev(post.falls$WaterYear), (post.falls$WaterYear)), c(rev(lowerc1), (upperc1)),  col=rgb(0,0,0.2,0.2), border=NA)
      lines(post.falls$WaterYear, estimate, lty=2, col=rgb(0,0,.2,1))


What do you notice? This is often a challenge in looking at streamflow trends because the time period assessed can drastically change your results.


Exercise 2: Interpreting results

Use filter/subset to answer th following questions:

  1. How many months have a positive trend? A negative trend?
  2. How many of those trends are significant at the 0.05 level?, The 0.10 level?
  3. Which months have significant trends?



How does the annual trend compare with sites across North Carolina?

Often, it is good practice to put your results in the context of a broader geography (stream reaches, river basin, state). Here we are going to run the analysis for sites across North Carolina and plot the results.

  • First, find sites that meet your criteria in NC
  • Next, create a data frame to hold the stats for each site. Then create a for loop that reads in the site data, aggregates to annual flow, and calculates the mk and sen test.
#What sites are available in NC?
nc.sites <- readNWISdata(stateCd="NC", parameterCd = pcode, service = "site", seriesCatalogOutput=TRUE)
  length(unique(nc.sites$site_no));   #tells how many sites are present
## [1] 732
#filter data using pipes!
nc.sites2 <- filter(nc.sites, parm_cd %in% pcode) %>%                             #only sites with discharge
  filter(stat_cd %in% scode) %>%                                                 #mean daily values
  filter(begin_date <= "1950_01_01") %>%  filter(end_date >= "2010_01_01") %>%   #in this range
  mutate(period = as.Date(end_date) - as.Date("1970-01-01")) %>%                #how many days of data
  filter(period > 30*365)   #keep those with 30 years of data 

length(unique(nc.sites2$site_no))
## [1] 67
#calculate unique sites
unique.sites <- unique(nc.sites2$site_no)

#set up data frame  
stats <- as.data.frame(matrix(nrow=0,ncol=7));        
#Add column headers
colnames(stats) <- c("Site", "Pval","Intercept","Slope","Nobs","StartYr","EndYr"); 
start.date = "1970-10-01"
end.date = "2017-09-30"

#Loop through each site and calculate statistics
for (i in 1:length(unique.sites)){
  #read in data
  zt <- readNWISdv(siteNumbers = unique.sites[i], parameterCd = pcode, statCd = scode, startDate=start.date, endDate = end.date)
    zt <- renameNWISColumns(zt);
  zt$Flow_cms <- zt$Flow*0.028316847
  zt$Year <- year(zt$Date)
    
  #use pipes to aggregate flow by year
    annual.flow <- zt %>%  group_by(Year) %>%
      summarise(Total_cms = sum(Flow_cms, na.rm=T), n=n()) %>%  round(3)
    annual.flow <- as.data.frame(annual.flow);  
  
  #run trend test
    temp.ts<-ts(annual.flow$Total_cms, start=min(annual.flow$Year, freq=1));
    mk <- mk.test(temp.ts); 
    sen <- sens.slope(temp.ts); 
    
  #fill dataframe
    stats[i,1] <- as.character(unique.sites[i]);             stats[i,2]<-round(mk$pvalg,3);            stats[i,3]<-round(sen$intercept,3)
    stats[i,4] <- round(sen$b.sen,3);                        stats[i,5]<-sen$nobs;                     stats[i,6]<-min(zt$Year)
    stats[i,7] <- max(zt$Year);
    
    #print(i)
}
summary(stats)
##      Site                Pval          Intercept            Slope         
##  Length:67          Min.   :0.0020   Min.   :   106.9   Min.   :-780.556  
##  Class :character   1st Qu.:0.0685   1st Qu.:  1785.8   1st Qu.: -59.303  
##  Mode  :character   Median :0.2120   Median :  4428.6   Median : -16.717  
##                     Mean   :0.3197   Mean   : 11595.7   Mean   : -44.084  
##                     3rd Qu.:0.5125   3rd Qu.: 13756.3   3rd Qu.:  -4.735  
##                     Max.   :1.0000   Max.   :100958.7   Max.   : 884.029  
##       Nobs          StartYr         EndYr     
##  Min.   : 4.00   Min.   :1970   Min.   :2016  
##  1st Qu.:47.00   1st Qu.:1970   1st Qu.:2017  
##  Median :48.00   Median :1970   Median :2017  
##  Mean   :43.58   Mean   :1973   Mean   :2017  
##  3rd Qu.:48.00   3rd Qu.:1970   3rd Qu.:2017  
##  Max.   :48.00   Max.   :2013   Max.   :2017
#remove sites with less than 30 years observations
stats <- subset(stats, Nobs>=30); dim(stats)
## [1] 59  7


Exercise 3: Run analysis for other regions

  1. Run this analysis for a different time period (e.g. 1930 to present). How did the map change?
  2. Plot mann kendall trends for the Neuse River Basin (looka t notes from LoadStreamflowDescription.Rmd on how to subset sites to a river basins. You will have to find the HUC for the Neuse).
  3. Run this analysis for a different state.