We are using the same source data as the SOE, from https://psl.noaa.gov/data/gridded/data.noaa.oisst.v2.highres.html
“NOAA High Resolution SST data provided by the NOAA/OAR/ESRL PSL, Boulder, Colorado, USA, from their Web site” [above]
Initial pull code was kindly provided by Kim Bastille: https://github.com/kimberly-bastille/ecopull/blob/main/.github/workflows/pull_satellite_data.yml. This pulls daily gridded SST for each year 1985-2021, using her code starting at line 260.
I am also using Kim's nc_to_raster function for the NEUS shelf, from https://github.com/kimberly-bastille/ecopull/blob/main/R/utils.R (below).
#' Convert netcdf to raster
#'
#' This function converts a netcdf object to a raster object
#' @param nc The nc file path
#' @param varname The name of the variable to convert to a raster
#' @param extent The latitude and longitude range the data should have, of the form c(xmin, xmax, ymin, ymax). Defaults to `c(0, 360, -90, 90)`
#' @param crop An extent object to use to crop data. Defaults to `raster::extent(280, 300, 30, 50)` (Northeast US)
#' @param show_images Boolean. Whether to display images of the data. Useful to check if cropping is occurring correctly.
#' @return A raster brick
#' @importFrom magrittr %>%
#' @export
nc_to_raster <- function(nc,
                         varname,
                         extent = c(0, 360, -90, 90),
                         crop = raster::extent(280, 300, 30, 50),
                         show_images = FALSE) {
  message("Reading .nc as brick...")
  r <- raster::brick(nc, varname = varname)
  message("Setting CRS...")
  raster::crs(r) <- "+proj=longlat +lat_1=35 +lat_2=45 +lat_0=40 +lon_0=-77 +x_0=0 +y_0=0 +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
  # not sure if this is necessary?
  raster::extent(r) <- raster::extent(extent)
  if (show_images) {
    par(mfrow = c(1, 2))
    raster::plot(r, 1, sub = "Full dataset")
  }
  message("Cropping data...")
  ne_data <- raster::crop(r, crop)
  #ne_data <- raster::rotate(ne_data) # add here for future pulls
  if (show_images) {
    raster::plot(ne_data, 1, sub = "Cropped dataset")
    par(mfrow = c(1, 1))
  }
  message("Done!")
  return(ne_data)
}
I pulled the data and stored NEUS rasters using the script pull_OISST.R, copied below, which was adapted from https://github.com/kimberly-bastille/ecopull/blob/main/.github/workflows/run_recent_years.yaml to work with the nc_to_raster function above.
Note that the download sometimes times out and had to be rerun for whatever years it didn't get to (probably my internet connection; it works eventually). A more robust retry sketch follows the loop below.
varname <- "sst"
years <- 1985:2021
for(i in years) {
  name <- paste0(i, ".nc")
  dir.create(here::here("data-raw", "gridded", "sst_data"),
             recursive = TRUE, showWarnings = FALSE) # no warning on reruns
  filename <- here::here("data-raw", "gridded", "sst_data",
                         paste0("test_", i, ".grd"))
  url <- paste0("https://downloads.psl.noaa.gov/Datasets/noaa.oisst.v2.highres/sst.day.mean.", i, ".v2.nc")
  download.file(url, destfile = name, mode = "wb") # wb: .nc files are binary
  text <- knitr::knit_expand(text = "test_{{year}} <- nc_to_raster(nc = name, varname = varname)
                             raster::writeRaster(test_{{year}}, filename = filename, overwrite = TRUE)",
                             year = i)
  print(text)
  try(eval(parse(text = text)))
  unlink(name) # remove nc file to save space
  print(paste("finished", i))
}
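If the pull keeps timing out, one option (a sketch, not what was actually run) is to raise R's download timeout and make reruns skip years that already completed, so only the failures are retried:
options(timeout = max(600, getOption("timeout"))) # default is 60 seconds
for (i in years) {
  filename <- here::here("data-raw", "gridded", "sst_data",
                         paste0("test_", i, ".grd"))
  if (file.exists(filename)) next # already pulled; skip on rerun
  # ...repeat the download/convert/write steps from the loop above...
}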
The plan is to match each survey date and location with the daily SST at that location (or the nearest location). Then the SST data will be integrated with the input data.
An alternative is to overlay the survey stations on the daily SST raster and extract the SST value that way; a minimal sketch follows.
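For example, a sketch of the overlay alternative (not run), assuming the test_2021 brick from the download loop is still in memory and a hypothetical stations data frame has declon/declat columns (decimal degrees, west negative). The brick's layers are named like "X2021.07.04" and its longitudes run 0-360, so west longitudes need +360:
# extract SST at station points for one day from the raster brick
pts <- cbind(stations$declon + 360, stations$declat) # lon-lat matrix, 0-360 lon
sst_oneday <- raster::extract(test_2021[["X2021.07.04"]], pts)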
This is a function to turn the rasters into a data frame for merging with the survey data. The merge needs a long data frame with the date split into year, month, and day columns, plus lat, lon, and sst, cropped to the NEUS extent. The conversion from raster to data frame is from https://towardsdatascience.com/transforming-spatial-data-to-tabular-data-in-r-4dab139f311f.
raster_to_sstdf <- function(brick,
                            rotate = TRUE){
  # rotate shifts 0-360 longitudes to -180-180 so we can crop in degrees west
  brick_r <- if (rotate) raster::rotate(brick) else brick
  brick_r <- raster::crop(brick_r, raster::extent(-77, -65, 35, 45))
  sstdf <- as.data.frame(raster::rasterToPoints(brick_r, spatial = TRUE))
  sstdf <- sstdf %>%
    dplyr::rename(Lon = x,
                  Lat = y) %>%
    tidyr::pivot_longer(cols = starts_with("X"),
                        names_to = c("year", "month", "day"),
                        names_prefix = "X",
                        names_sep = "\\.",
                        values_to = "sst")
  return(sstdf)
}
Note that this works with the rasters still in memory from running the download script above; the `name` statement could instead read the saved .grd files back in, as sketched below.
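A minimal sketch of that alternative, replacing the get() call inside the loop below (i is the same loop index over years):
# read the stored .grd brick back from disk instead of using get()
name <- raster::brick(here::here("data-raw", "gridded", "sst_data",
                                 paste0("test_", i, ".grd")))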
years <- 1985:2021
for(i in years) {
  name <- get(paste0("test_", i))
  filename <- here::here("data-raw", "gridded", "sst_data",
                         paste0("sst", i, ".rds"))
  text <- knitr::knit_expand(text = "sst{{year}} <- raster_to_sstdf(brick = name)
                             saveRDS(sst{{year}}, filename)",
                             year = i)
  print(text)
  try(eval(parse(text = text)))
}
Plot to check that the SST data frames look reasonable:
#visualize
sst2021 <- readRDS(here("data-raw/gridded/sst_data/sst2021.rds"))

oneday <- sst2021[sst2021$month == "07" & sst2021$day == "04",]
jan15 <- sst2021[sst2021$month == "01" & sst2021$day == "15",]
mar15 <- sst2021[sst2021$month == "03" & sst2021$day == "15",]
may15 <- sst2021[sst2021$month == "05" & sst2021$day == "15",]
jul15 <- sst2021[sst2021$month == "07" & sst2021$day == "15",]
sep15 <- sst2021[sst2021$month == "09" & sst2021$day == "15",]
nov15 <- sst2021[sst2021$month == "11" & sst2021$day == "15",]

dailysstplot <- function(oneday){
  ggplot() +
    geom_tile(data = oneday, aes(x = Lon, y = Lat, fill = sst)) +
    geom_sf(data = ecodata::coast) +
    geom_point(data = northwest_atlantic_grid, aes(x = Lon, y = Lat),
               size = 0.05, alpha = 0.1) +
    scale_fill_gradientn(name = "Temp C",
                         limits = c(0.5, 31),
                         colours = c(scales::muted("blue"), "white",
                                     scales::muted("red"), "black")) +
    coord_sf(xlim = c(-77, -65), ylim = c(35, 45)) +
    ggtitle(paste("SST, mm dd yyyy:", unique(oneday$month),
                  unique(oneday$day), unique(oneday$year), sep = " "))
}
#dailysstplot(oneday)
#par(mfrow=c(2,3))
dailysstplot(jan15)
dailysstplot(mar15)
dailysstplot(may15)
dailysstplot(jul15)
dailysstplot(sep15)
dailysstplot(nov15)
Next, find the set of unique year-month-day combinations in the diet dataset (including NEAMAP) to find the equivalent times in the SST data. BUT I didn't retain month and day in this dataset; the station id can be matched back to the full dataset instead.
This shows the work of getting the dataset with day, month, and recorded surface temperature for NEAMAP, along with some quality assurance. The resulting dataset has been saved, so this code is not run live.
bluepyagg_stn_all <- readRDS(here("fhdat/bluepyagg_stn_all.rds"))

diethauls <- bluepyagg_stn_all %>%
  dplyr::select(id, declat, declon)

# date already included in tow id for NEAMAP--nope, need this
# need allfh to get the day and month for NEFSC trawl data
load(url("https://github.com/Laurels1/Condition/raw/master/data/allfh.RData"))

# October 8 2022: add NEFSC 2021 data
load(here("fhdat/allfh21.Rdata"))

allfh <- allfh %>%
  dplyr::bind_rows(allfh21)

NEFSCstations <- allfh %>%
  dplyr::mutate(id = paste0(cruise6, "_", station),
                year = as.numeric(year),
                month = as.numeric(month),
                day = as.numeric(day),
                declon = -declon) %>% # allfh stores degrees west as positive
  dplyr::select(id, year, month, day, declat, declon) %>%
  dplyr::distinct()

#diethauls <- left_join(diethauls, NEFSCstations)
Still need month and day columns for NEAMAP; got ’em fast thanks to Jim Gartland!
NEAMAPstationSST <- read.csv(here("fhdat/NEAMAP SST_2007_2021.csv"))

NEAMAPstations <- NEAMAPstationSST %>%
  dplyr::mutate(id = station,
                year = as.numeric(year),
                month = as.numeric(month),
                day = as.numeric(day),
                declat = latitude,
                declon = longitude) %>%
  dplyr::select(id, year, month, day, declat, declon) %>%
  dplyr::distinct()
Now combine the NEAMAP and NEFSC stations and join with the diet stations:
Allstations <- bind_rows(NEFSCstations, NEAMAPstations)
diethauls <- left_join(diethauls, Allstations)
Check NA dates: 5 hauls in allfh have no station data. One made it into the bluefish diet database (201202_389) because it has piscivores sampled (cod) but no bluefish prey; it has no spatial information, so it has not been included in the analysis, and it is on the Scotian Shelf.
badhaulNEFSC <- as.data.frame(filter(Allstations, is.na(month))) %>%
  tidyr::separate(id, c("cruise6", "station")) %>%
  dplyr::select(cruise6, station) %>%
  dplyr::mutate(across(where(is.character), as.integer))

badhaulNEFSCFH <- badhaulNEFSC %>%
  left_join(allfh)

badhaulall <- as.data.frame(filter(diethauls, is.na(month)))

badhaulNEFSCFH %>% filter(station %in% 389)
NEAMAP has station data; 34 records don't merge because latitude differs at the 8th decimal place. Jim Gartland confirmed that this is due to conversions between different software during data processing. We will use the version of latitude recommended by the NEAMAP team, which is the original entries in the food habits data.
mismatchNEAMAP <- badhaulall %>%
  dplyr::filter(!is.na(declat)) %>%
  dplyr::select(id)

mismatchNEAMAP

mismatchNEAMAPstn <- mismatchNEAMAP %>%
  left_join(NEAMAPstations)

print(mismatchNEAMAPstn$declat, digits = 16)

mismatchNEAMAPfh <- mismatchNEAMAP %>%
  left_join(bluepyagg_stn_all)

print(mismatchNEAMAPfh$declat, digits = 16)
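As a quick sanity check (a sketch; chk is a hypothetical name), the two latitude versions can be compared with a tolerance to confirm the differences really are only floating-point noise:
# join the two latitude versions by station id and confirm they agree to
# well beyond survey precision (differences ~1e-8 expected)
chk <- dplyr::left_join(mismatchNEAMAPstn,
                        dplyr::select(mismatchNEAMAPfh, id, declat),
                        by = "id", suffix = c("_stn", "_fh"))
all(abs(chk$declat_stn - chk$declat_fh) < 1e-6) # expect TRUE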
Since we know the NEAMAP lat and lon in the original diet dataset are correct, we merge only the station id, day, month, year, and surface temperature into the diet dataset to avoid the mismatch with those 34 stations. We also add the NEAMAP SST field to surftemp so it is in the same column as the in-situ temperature measured on the NEFSC survey.
NEAMAPstations <- NEAMAPstationSST %>%
  dplyr::mutate(id = station,
                year = as.numeric(year),
                month = as.numeric(month),
                day = as.numeric(day)) %>%
  dplyr::select(id, year, month, day) %>%
  dplyr::distinct()

# remake diethauls
diethauls <- bluepyagg_stn_all %>%
  dplyr::select(id, declat, declon)

NEFSCstations <- dplyr::select(NEFSCstations, c(-declat, -declon))

Allstations <- bind_rows(NEFSCstations, NEAMAPstations)

#station id, lat lon, year month day
diethauls <- left_join(diethauls, Allstations)

#add year month day to diet data
bluepyagg_stn_all <- left_join(bluepyagg_stn_all, diethauls)

# add NEAMAP SST to surftemp field
NEAMAPidSST <- NEAMAPstationSST %>%
  mutate(id = station) %>%
  dplyr::select(id, SST)

bluepyagg_stn_all <- left_join(bluepyagg_stn_all, NEAMAPidSST, by = "id") %>%
  mutate(surftemp = coalesce(surftemp, SST)) %>%
  dplyr::select(-SST)

# save merged dataset with day, month, and NEAMAP surftemp, same name
saveRDS(bluepyagg_stn_all, here("fhdat/bluepyagg_stn_all.rds"))
In this new dataset, the majority of missing surftemp observations are from NEFSC (3073), but there are still 45 NEAMAP stations missing surftemp as well; these can be filled with the OISST temperature.
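A tally along these lines could reproduce those counts (a sketch; the id pattern used to distinguish the surveys is an assumption and should be adjusted to the actual id format):
# count missing surftemp by survey; NEFSC ids are cruise6_station
# (all digits), so anything else is assumed here to be NEAMAP
bluepyagg_stn_all %>%
  dplyr::filter(is.na(surftemp)) %>%
  dplyr::mutate(survey = dplyr::if_else(grepl("^[0-9]+_[0-9]+$", id),
                                        "NEFSC", "NEAMAP")) %>%
  dplyr::count(survey)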
Now to get OISST data for each day/month/year/station location.
Process:
* For each year, read in the raster brick file or dataframe for that year
* Find dates with survey hauls
* Get SST for each survey haul date and position (nearest neighbor)
* Save year, month, day, position, OISST
* Close the raster brick (df)
* bind_rows into a single dataframe over all years (look at the matches and see if ok)
* Merge with the diet dataframe
#read in diet data with month day fields
bluepyagg_stn_all <- readRDS(here("fhdat/bluepyagg_stn_all.rds"))

stations <- bluepyagg_stn_all %>%
  dplyr::mutate(day = str_pad(day, 2, pad = '0'),
                month = str_pad(month, 2, pad = '0'),
                yrmody = as.numeric(paste0(year, month, day))) %>%
  dplyr::select(id, declon, declat, year, yrmody) %>%
  na.omit() %>%
  sf::st_as_sf(coords = c("declon", "declat"), crs = 4326, remove = FALSE)

#list of SST dataframes
SSTdfs <- list.files(here("data-raw/gridded/sst_data/"), pattern = "*.rds")

dietstn_OISST <- tibble()

for(df in SSTdfs){
  sstdf <- readRDS(paste0(here("data-raw/gridded/sst_data/", df)))

  # keep only bluefish dates in SST year
  stationsyr <- stations %>%
    filter(year == unique(sstdf$year))

  # keep only sst days in bluefish dataset
  sstdf_survdays <- sstdf %>%
    dplyr::mutate(yrmody = as.numeric(paste0(year, month, day))) %>%
    dplyr::filter(yrmody %in% unique(stationsyr$yrmody)) %>%
    dplyr::mutate(year = as.numeric(year),
                  month = as.numeric(month),
                  day = as.numeric(day),
                  declon = Lon,
                  declat = Lat) %>%
    dplyr::select(-Lon, -Lat) %>%
    sf::st_as_sf(coords = c("declon", "declat"), crs = 4326, remove = FALSE)

  # now join by nearest neighbor and date; st_nn is from the nngeo package
  #https://stackoverflow.com/questions/71959927/spatial-join-two-data-frames-by-nearest-feature-and-date-in-r
  yrdietOISST <- do.call('rbind', lapply(split(stationsyr, 1:nrow(stationsyr)), function(x) {
    st_join(x, sstdf_survdays[sstdf_survdays$yrmody == unique(x$yrmody),],
            #join = st_nearest_feature
            join = st_nn, k = 1, progress = FALSE)
  }))

  # #datatable solution--works but doesnt seem faster?
  # df1 <- data.table(stationsyr)
  #
  # .nearest_samedate <- function(x) {
  #   st_join(st_as_sf(x), sstdf_survdays[sstdf_survdays$yrmody == unique(x$yrmody),], join = st_nearest_feature)
  # }
  # #
  # yrdietOISST <- df1[, .nearest_samedate(.SD), by = list(1:nrow(df1))]

  dietstn_OISST <- rbind(dietstn_OISST, yrdietOISST)
}

saveRDS(dietstn_OISST, here("data-raw/dietstn_OISST.rds"))
Now join the OISST dataset with the bluefish prey data (1985-):
#read in diet data and station-OISST
bluepyagg_stn_all <- readRDS(here("fhdat/bluepyagg_stn_all.rds"))
dietstn_OISST <- readRDS(here("data-raw/dietstn_OISST.rds"))

dietstn_OISST_merge <- dietstn_OISST %>%
  dplyr::rename(declon = declon.x,
                declat = declat.x,
                year = year.x,
                oisst = sst) %>%
  dplyr::select(id, oisst) %>%
  sf::st_drop_geometry()

bluepyagg_stn_all_OISST <- left_join(bluepyagg_stn_all, dietstn_OISST_merge)

saveRDS(bluepyagg_stn_all_OISST, here("fhdat/bluepyagg_stn_all_OISST.rds"))
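Eventually the missing in-situ temperatures can be filled from OISST, mirroring the NEAMAP coalesce above (a sketch; surftemp_filled is a hypothetical column name):
# prefer the in-situ measurement, fall back to OISST where it is missing
bluepyagg_stn_all_OISST <- bluepyagg_stn_all_OISST %>%
  dplyr::mutate(surftemp_filled = dplyr::coalesce(surftemp, oisst))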
Compare the in-situ surface temperatures with OISST where both exist:
comparesst <- bluepyagg_stn_all_OISST %>%
  dplyr::filter(year > 1984) %>%
  dplyr::select(surftemp, oisst) %>%
  na.omit()

ggplot2::ggplot(comparesst, aes(x = surftemp, y = oisst)) +
  geom_point(color = "blue", alpha = 0.1) +
  geom_abline(intercept = 0, slope = 1) +
  theme_bw()
mapsst <- bluepyagg_stn_all_OISST %>%
  dplyr::filter(year > 1984) %>%
  dplyr::mutate(sstdiff = surftemp - oisst) %>%
  dplyr::select(id, year, season_ng, declon, declat, surftemp, oisst, sstdiff)

yrmap <- function(mapyr){
  ggplot2::ggplot(mapsst %>% filter(year == mapyr)) +
    geom_sf(data = ecodata::coast) +
    coord_sf(xlim = c(-77, -65), ylim = c(35, 45)) +
    geom_point(aes(x = declon, y = declat, colour = sstdiff)) +
    scale_color_gradient2(low = "blue",
                          mid = "green",
                          high = "purple",
                          midpoint = 0,
                          na.value = "yellow") +
    theme_bw() +
    facet_wrap(~season_ng) +
    ggtitle(paste("SST difference survey-OISST:", mapyr, sep = " "))
}
Map stations by SST match/mismatch for each year. Yellow is missing data that would be filled by OISST, and the color range shows how different SST is for stations with both values.
for(mapyr in 1985:2021){
  cat(" \n####", as.character(mapyr), " \n")
  print(yrmap(mapyr))
  cat(" \n")
}
Following up on other temperature sources:

* Continuous seawater source for Bigelow only (2009-)
* Net sensors?
* Higher resolution satellite products: AVHRR

Pathfinder SST

* 1981-present
* Input to the OISST
* 4 km instead of 25 km
This is higher resolution source data than used in the SOE, https://www.ncei.noaa.gov/products/avhrr-pathfinder-sst
“These data were provided by GHRSST and the NOAA National Centers for Environmental Information.”
“Saha, Korak; Zhao, Xuepeng; Zhang, Huai-min; Casey, Kenneth S.; Zhang, Dexin; Baker-Yeboah, Sheekela; Kilpatrick, Katherine A.; Evans, Robert H.; Ryan, Thomas; Relph, John M. (2018). AVHRR Pathfinder version 5.3 level 3 collated (L3C) global 4km sea surface temperature for 1981-Present. [indicate subset used]. NOAA National Centers for Environmental Information. Dataset. https://doi.org/10.7289/v52j68xx. Accessed [date].”
AVHRR data are organized into year/data folders with 2 nc files for each date, one day and one night.
Downloading all of this would be too much, so instead select only dates in months matching the survey data.
There may be missing SSTs for a given day and area due to clouds. We can try borrowing from the nearest date, but limit it to one day earlier or later (a fill sketch follows the list below).
Will need to:

* Get a list of filenames for the year from the website
* Split the filenames into dates and times
* Match dates to dates (year, month) in the survey diet dataset
* Download only filenames matching those dates (month level in case we need to fill)
* Crop to NE shelf
* Save as dataframe, one for each year?
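A minimal sketch of the one-day-borrowing idea (get_sst_filled is a hypothetical helper; it assumes an annual AVHRR data frame like those built below, with x, y, sst, and date columns):
# for a station with no usable pixel on its date, fall back to the day
# before or after it, and give up beyond that
get_sst_filled <- function(sstdf, target_date, lon, lat, tol = 0.05) {
  for (d in c(0, -1, 1)) { # exact date first, then borrow +/-1 day
    hit <- sstdf %>%
      dplyr::filter(as.Date(date) == as.Date(target_date) + d,
                    abs(x - lon) < tol, abs(y - lat) < tol,
                    !is.na(sst))
    if (nrow(hit) > 0) return(mean(hit$sst)) # average nearby good pixels
  }
  NA_real_ # nothing within +/-1 day
}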
Can I go right from daily nc to dataframe? Yes, making annual dfs here:
#read in diet data with month day fields
bluepyagg_stn_all <- readRDS(here("fhdat/bluepyagg_stn_all.rds"))

stations <- bluepyagg_stn_all %>%
  dplyr::mutate(day = str_pad(day, 2, pad = '0'),
                month = str_pad(month, 2, pad = '0'),
                yrmody = as.numeric(paste0(year, month, day))) %>%
  dplyr::select(id, declon, declat, year, yrmody) %>%
  na.omit() %>%
  sf::st_as_sf(coords = c("declon", "declat"), crs = 4326, remove = FALSE)

#full list of filenames to filter by survey dates
# get a list from https server R: https://stackoverflow.com/questions/67695769/list-files-on-https-server-using-r

years <- 1985:2021
#years <- 2011:2021 #this was because I had to stop downloading, would have done all

for(i in years) {
  sstyrdf <- data.frame()

  url <- paste0("https://www.ncei.noaa.gov/data/oceans/pathfinder/Version5.3/L3C/", i, "/data/")
  page <- rvest::read_html(url)

  #list of SST AVHRR files in each year folder
  yrfiles <- rvest::html_elements(page, xpath = ".//a[contains(@href, '.nc')]") %>%
    rvest::html_text()

  # keep only bluefish dates in SST year, +/- 2 days for borrowing
  datesyr <- stations %>%
    sf::st_drop_geometry() %>%
    dplyr::filter(year == i) %>%
    dplyr::select(yrmody) %>%
    dplyr::distinct() %>%
    dplyr::mutate(p1 = yrmody + 1,
                  p2 = yrmody + 2,
                  m1 = yrmody - 1,
                  m2 = yrmody - 2) %>%
    tidyr::pivot_longer(everything(), values_to = "yrmody") %>%
    dplyr::select(yrmody) %>%
    dplyr::distinct()

  #remove bad dates (e.g., impossible days created by the +/- arithmetic)
  #and convert to character for comparison with filenames
  datesyrch <- lubridate::parse_date_time(datesyr$yrmody, "%y%m%d") %>%
    na.omit() %>%
    format("%Y%m%d")

  #list of files for the survey dates plus or minus 2 days
  yrfiles <- yrfiles[stringr::str_starts(yrfiles, paste(datesyrch, collapse = "|"))]

  # get data from all files
  for(d in 1:length(yrfiles)){
    name <- paste0(stringr::str_extract(yrfiles[d], "^.{14}"), ".nc")
    download.file(paste0(url, yrfiles[d]), destfile = name, mode = "wb")

    r <- terra::rast(name, c("sea_surface_temperature",
                             "quality_level"))
    rcrop <- terra::crop(r, c(-77, -65, 35, 45))

    sstdaydf <- terra::as.data.frame(rcrop, xy = TRUE) %>%
      dplyr::mutate(sst = sea_surface_temperature - 273.15) %>% # Kelvin to Celsius
      dplyr::filter(quality_level > 0) %>%
      dplyr::mutate(date = unique(terra::time(rcrop)))

    unlink(name) # remove nc file to save space

    sstyrdf <- bind_rows(sstyrdf, sstdaydf)
  }# end yrfiles within a year

  #save each year's sst dataframe for survey dates +-2
  filename <- here::here("data-raw", "gridded", "sst_data", paste0("AVHRRsst_", i, ".rds"))
  saveRDS(sstyrdf, filename)
} # end years
That took 7 hours for 1985-2010 plus another 3 hours for 2011-2021.
All sst files have been uploaded to this google folder.
plotAVHRR <- function(sstdfdat){
  ggplot() +
    geom_tile(data = sstdfdat, aes(x = x, y = y, fill = sst)) +
    geom_sf(data = ecodata::coast) +
    scale_fill_gradientn(name = "Temp C",
                         limits = c(0.5, 31),
                         colours = c(scales::muted("blue"), "white",
                                     scales::muted("red"), "black")) +
    coord_sf(xlim = c(-77, -65), ylim = c(35, 45)) +
    #ggtitle(paste("SST, AVHRR", unique(sstdfdat$date), sep = " ")) +
    facet_wrap(~as.character(date))
}
Here is some data plotted for 2021, illustrating both resolution and cloud issues. Data have been filtered for quality 3 and above as suggested by Kim Hyde. Gray is everything below 0.5 C. Night and day are shown as facets.
sstyrdf <- readRDS(here("data-raw/gridded/sst_data/AVHRRarchive/AVHRRsst_2021.rds"))

survdates <- unique(stringr::str_extract(sstyrdf$date, "^.{10}"))

for(survday in survdates){
  mapday <- sstyrdf %>%
    filter(quality_level > 2, #Kim Hyde recommendation
           str_detect(date, pattern = regex(paste0("\\b", survday))))

  cat(" \n####", as.character(survday), " \n")
  print(plotAVHRR(mapday))
  cat(" \n")
}
If we are going to use this, we need a way to combine the day and night passes and fill holes.
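One possible approach (a sketch; sstcombined is a hypothetical name) is to average the day and night passes per pixel per date after quality filtering, which closes some holes; remaining gaps could then borrow from adjacent days as sketched earlier:
# collapse day/night passes to one value per pixel per date
sstcombined <- sstyrdf %>%
  dplyr::filter(quality_level > 2) %>% # Kim Hyde recommendation
  dplyr::mutate(day = as.Date(date)) %>%
  dplyr::group_by(x, y, day) %>%
  dplyr::summarise(sst = mean(sst, na.rm = TRUE), .groups = "drop")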