Environmental data from NOAA BigQuery

earth science
NOAA data

I created this demo for ECE 228 “Machine Learning for Physical Applications.” It demonstrates how to load, visualize, and analyze data from the NOAA GSOD dataset.

I’ve added a demo for prediction of seasonal time series, using the FFT to find the major seasonal cycles and SARIMAX (a seasonal ARIMA model) to forecast. This notebook loads data from the NOAA GSOD dataset (2014 to 2018, accessed via the BigQuery API: https://cloud.google.com/bigquery/public-data/) into a pandas DataFrame and visualizes it.
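
The demo reads locally cached pickles rather than querying BigQuery on the fly. For reference, here is a minimal sketch of how those caches could be produced with the google-cloud-bigquery client (assuming the client library is installed and GCP credentials are configured; the public tables follow the bigquery-public-data.noaa_gsod.gsodYYYY pattern). This is not part of the original demo.

Code
from google.cloud import bigquery

client = bigquery.Client()  # assumes GCP credentials are configured in the environment

# One yearly table of daily observations; selecting only the needed columns
# would reduce the amount of data scanned (and billed).
query = "SELECT * FROM `bigquery-public-data.noaa_gsod.gsod2018`"
gsod_2018 = client.query(query).result().to_dataframe()
gsod_2018.to_pickle('datasets/NOAA_SST/noaa_gsod.gsod2018')

# Station metadata, cached in the layout the notebook expects below.
stations = client.query("SELECT * FROM `bigquery-public-data.noaa_gsod.stations`").result().to_dataframe()
stations.to_pickle('datasets/NOAA_SST/noaa_gsod.stations')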

1 Include packages

Code
import warnings
warnings.filterwarnings("ignore")

import glob
import time
import datetime

import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mpl_toolkits.basemap import Basemap

# signal processing
import scipy.signal as sig
from scipy.fftpack import fft, ifft, fftfreq

# time-series modeling
import statsmodels.api as sma
import statsmodels.tsa.statespace.api as sm
from statsmodels.tsa.stattools import adfuller

from numpy.random import randint

# print(pd.__version__)

2 Load the data

After storing the data locally, load the five years from 2014 through 2018 into a single DataFrame.

Code
dataset_path='datasets/NOAA_SST/'
fpath = dataset_path # + 'noaa_gsod/'
pandas_files = sorted(glob.glob(fpath + 'noaa_gsod.gsod*'))
pandas_files = pandas_files[-5:]  # Take the last five years

stations = pd.read_pickle(dataset_path+'noaa_gsod.stations')  # load station metadata
stations = stations[stations['begin'].astype(int) > 20140101]  # keep stations whose records begin after 2014-01-01

df = None
for i, fi in enumerate(pandas_files):
    dtmp = pd.read_pickle(fi)
    
    # read year from the filename and add as a column
    yr = int(fi[-4:]) 
    dtmp['yr'] = yr*np.ones(len(dtmp),).astype(int)
    
    # create a datetime column, makes plotting timeseries easier
    dtmp['Datetime'] = pd.to_datetime((dtmp['yr'].astype(str) + dtmp['mo'].astype(str) + dtmp['da'].astype(str)),\
                                       format='%Y%m%d')
    
    # stick the years together
    df = pd.concat([df, dtmp], ignore_index=True)

3 Merge different datasets

Code
df_all = df.merge(stations, left_on='stn',right_on='usaf')
df_all = df_all[df_all['lon'].notna()]

df_all = df_all.set_index('Datetime', drop=True)

4 Select a random station (red + on map)

Code
np.random.seed(33) # set the seed to show the same example every time for the demo
rs = np.unique(df_all['stn'].values) # find unique stations with data
rand_stat = rs[randint(len(rs))] # pick a random station

features = df_all.loc[df_all['stn'] == rand_stat] # pick weather at random station
features = features.drop(columns=['stn','max','min'], axis=1)
features = features.sort_index()

# View station locations
fig = plt.figure(figsize=(18.5, 10.5))
m = Basemap(projection='cyl',llcrnrlat=-90,urcrnrlat=90,\
            llcrnrlon=-180,urcrnrlon=180, resolution='l')

# Make map look pretty
m.drawmapboundary(fill_color='xkcd:lightblue')
m.fillcontinents(color='xkcd:green',lake_color='xkcd:lightblue')
m.drawmeridians(np.arange(0.,350.,30.),labels=[True,False,False,True])
m.drawparallels(np.arange(-90.,90,30.),labels=[False,True,True,False])

# plot the stations with blue plusses
lon = df_all['lon'].tolist()
lat = df_all['lat'].tolist()
xpt,ypt = m(lon,lat)
m.plot(xpt,ypt,'b+') 

# Show the position of the station
print('Station ID: ', rand_stat, '. Number of samples: ', len(df_all.loc[df_all['stn']==rand_stat]))
lon = features['lon'].tolist()
lat = features['lat'].tolist()
m.plot(lon, lat,'r+',markersize=10) 
plt.show()
Station ID:  066900 . Number of samples:  1786
Figure 1: A map using a cylindrical projection and showing all stations with data from 2014 through 2018 (blue +) with a randomly selected station (red +).

5 Plot temperature timeseries

Code
x = features['temp'].values
xmean = np.mean(x)
# Remove the mean for processing
y = x - xmean
N = len(y)

downsamp = 7
yt = sig.decimate(y, downsamp)
tst = features.index[::downsamp]

# Check if it's stationary! (it's not)
# test = adfuller(y)
# print(["It's stationary" if test[1]<0.05 else "It's not stationary"])

fig=plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(features.index, x)
ax.plot(features.index, x, marker='o')
ax.plot(tst, yt+xmean, zorder=3, marker='o',color='r',markersize=3)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()
plt.legend(['Daily temperature','Weekly temperature'])
plt.ylabel('Temperature ($^\circ$C)')
plt.show()

Figure 2: Temperature from the randomly selected station shown at daily and weekly resolution.
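
The weekly series above comes from scipy.signal.decimate, which low-pass filters the data (by default with an order-8 Chebyshev type I IIR filter) before keeping every seventh sample, so day-to-day variability does not alias into the weekly curve. A minimal sketch of the difference versus naive slicing, reusing y and downsamp from above:

Code
# Compare anti-aliased decimation with naive subsampling.
naive_weekly = y[::downsamp]                 # keep every 7th raw daily sample
filtered_weekly = sig.decimate(y, downsamp)  # low-pass filter, then downsample
print('std (naive): %.2f, std (decimated): %.2f' % (np.std(naive_weekly), np.std(filtered_weekly)))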

6 Find the temperature seasonality

Code
Nfft = int( 2**(np.ceil(np.log2(N))) )
freq = fftfreq(Nfft)
freq[freq==0] = freq[freq==0] + 0.0001 # avoid divide-by-zero
n_oneside = Nfft//2
t_d = 1/freq[:n_oneside] 
seasonality = np.abs(fft(y, Nfft))  # zero-pad to Nfft so the bins match fftfreq(Nfft)

# find the peak frequency (the dominant seasonal cycle)
pkfreq, props = sig.find_peaks(seasonality[:n_oneside], prominence=5000)
plt.figure()
plt.plot(freq[:n_oneside], seasonality[:n_oneside], 'b', label='Seasonality (cycles per day)')
plt.stem([1/(365*2), 2/365, 1/365], 5000*np.ones((3,)), 'r',
         label='Biennial/semiannual/annual reference')
plt.scatter(freq[pkfreq[0]], seasonality[pkfreq[0]],
            label='Peak: ' + str(int(1/freq[pkfreq[0]])) + ' days')
plt.xlabel('Frequency (1/days)')
plt.xlim([0, 0.05])

plt.legend()
plt.show()

ts = features.index

Figure 3: A frequency plot showing the seasonality of temperature at the randomly selected station.
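
The dominant peak found here is what sets the seasonal period of the forecasting model in the next section. As a sanity check, here is a minimal sketch (reusing freq, pkfreq, and downsamp from the cells above) that converts the peak frequency into a period in days and into a lag in weekly samples:

Code
# Convert the dominant FFT peak into a seasonal period.
peak_period_days = 1 / freq[pkfreq[0]]                     # expected to be roughly one year
seasonal_lag = int(np.floor(peak_period_days / downsamp))  # period in weekly samples
print('Peak period: %.0f days -> seasonal lag of %d weekly samples' % (peak_period_days, seasonal_lag))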

7 Train a SARIMAX model on weekly temperature

Code
tpred = 100  # forecast 100 additional weekly samples beyond the end of the data
Ntrain = 230  # alternatively: len(tst[tst < datetime.datetime.strptime('January 1, 2018', '%B %d, %Y')])
traindata = yt[0:Ntrain]
testdata = yt[Ntrain:]

# weekly timestamps covering the held-out samples plus the tpred extra forecast steps
ts_pred = [tst[-(len(yt)-Ntrain)] + DateOffset(days=i) for i in range((tpred+len(yt)-Ntrain)*downsamp)]
ts_pred = ts_pred[::downsamp]

# fit a seasonal ARIMA (SARIMAX) model; the seasonal period is the FFT peak converted to weekly samples
mod = sm.SARIMAX(traindata, order=(0, 0, 0),
                 seasonal_order=(2, 0, 2, int(np.floor(1/freq[pkfreq[0]]/downsamp))))
res = mod.fit()
 
train = res.predict(1, Ntrain)
prediction = res.forecast(tpred+ (len(yt)-Ntrain))

fig, ax = plt.subplots()
ax.scatter(features.index, y, label='Data')
ax.plot(ts_pred, prediction, 'r', label='Prediction (unseen data)')
ax.plot(tst[:Ntrain], train, 'r:', label='Training Results')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()
ax.set_ylabel('Temperature anomaly ($^\circ$C)')
ax.legend()
plt.show()

Figure 4: Temperature anomaly at the randomly selected station along with the SARIMAX predictions on training data (dotted red) and unseen data (solid red). The model requires a ramp-up period of about one year at weekly resolution.
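
One way to put a number on the forecast is to compare the part of the prediction that overlaps the held-out weekly samples against testdata. RMSE is my choice of metric here, not part of the original demo; this is a minimal sketch reusing the variables above:

Code
# Root-mean-square error over the held-out weekly samples.
overlap = len(yt) - Ntrain  # number of weekly samples not used for training
rmse = np.sqrt(np.mean((prediction[:overlap] - testdata)**2))
print('Test RMSE over %d held-out weeks: %.2f degrees' % (overlap, rmse))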