import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import fnmatch
df = pd.read_parquet('data/streamflow_prediction_dataset.parquet')
display(df)
SNWD_BisonLake | SNWD_McClurePass | WTEQ_BisonLake | WTEQ_McClurePass | PREC_BisonLake | PREC_McClurePass | PRCP_BisonLake | PRCP_McClurePass | TAVG_BisonLake | TAVG_McClurePass | ... | soilmoisture_station607_4ft | soilmoisture_station607_8ft | soilmoisture_station607_20ft | soilmoisture_station680_2ft | soilmoisture_station680_8ft | soilmoisture_station680_20ft | soilmoisture_station802_2ft | soilmoisture_station802_8ft | soilmoisture_station802_20ft | streamflow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2008-03-12 | 82.0 | 60.0 | 28.6 | 23.9 | 29.1 | 22.9 | 0.1 | 0.0 | 19.4 | 30.2 | ... | 10.2 | 24.2 | 25.8 | 14.6 | 11.7 | 26.0 | 4.0 | 1.3 | 6.5 | 2360.0 |
2008-03-15 | 86.0 | 63.0 | 29.2 | 24.5 | 29.7 | 23.4 | 0.1 | 0.2 | 11.7 | 23.4 | ... | 10.3 | 24.4 | 25.4 | 14.7 | 11.7 | 26.2 | 4.0 | 1.4 | 6.9 | 2260.0 |
2008-03-17 | 84.0 | 63.0 | 29.3 | 24.8 | 29.8 | 23.6 | 0.0 | 0.2 | 14.4 | 24.3 | ... | 10.3 | 24.7 | 25.2 | 14.4 | 12.1 | 26.1 | 4.1 | 1.4 | 6.7 | 2260.0 |
2008-03-18 | 83.0 | 62.0 | 29.3 | 24.8 | 29.8 | 23.6 | 0.0 | 0.0 | 11.8 | 23.9 | ... | 10.6 | 24.9 | 25.3 | 14.7 | 12.1 | 26.0 | 4.0 | 1.4 | 6.6 | 2260.0 |
2008-03-19 | 82.0 | 61.0 | 29.4 | 24.9 | 29.8 | 23.6 | 0.0 | 0.0 | 20.8 | 30.2 | ... | 10.6 | 25.0 | 25.1 | 14.6 | 11.7 | 25.9 | 4.0 | 1.6 | 6.9 | 2200.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-07-23 | 0.0 | 0.0 | 0.0 | 0.0 | 27.7 | 20.7 | 0.2 | 0.2 | 52.9 | 62.1 | ... | 13.8 | 14.5 | 11.5 | 35.5 | 14.2 | 30.2 | 24.1 | 5.5 | 9.2 | 1170.0 |
2021-07-24 | 0.0 | 0.0 | 0.0 | 0.0 | 27.7 | 21.1 | 0.0 | 0.4 | 54.7 | 57.0 | ... | 13.4 | 14.1 | 11.3 | 33.7 | 13.9 | 30.2 | 24.0 | 5.3 | 8.7 | 1240.0 |
2021-07-25 | 0.0 | 0.0 | 0.0 | 0.0 | 27.8 | 21.5 | 0.1 | 0.4 | 52.7 | 57.6 | ... | 13.1 | 13.9 | 10.8 | 39.8 | 13.8 | 30.2 | 23.8 | 5.0 | 8.1 | 1190.0 |
2021-07-26 | 0.0 | 0.0 | 0.0 | 0.0 | 27.8 | 21.5 | 0.0 | 0.0 | 57.0 | 61.2 | ... | 12.9 | 13.9 | 10.4 | 37.7 | 14.1 | 30.2 | 23.7 | 4.8 | 7.0 | 1170.0 |
2021-07-27 | 0.0 | 0.0 | 0.0 | 0.0 | 27.8 | 21.5 | 0.0 | 0.0 | 58.5 | 64.8 | ... | 12.4 | 13.6 | 10.1 | 34.0 | 13.9 | 30.1 | 23.9 | 4.4 | 6.1 | 1110.0 |
2996 rows × 30 columns
sns.heatmap(df.corr())
plt.show()
Good reminder that a lot of features are redundant. TAVG, TMAX, and TMIN are average, max, and min temperatures for a given day. We can drop TMAX and TMIN. SNWD and WTEQ are snow depth and snow water equivalent, respectively. These two features are highly correlated. We will select snow water equivalent. PREC and PRCP mean precipitation accumulation and precipitation increment. Since we are using daily values, these two measures are identical. Accumulation measures the accumulation of rain over the course of a period, whereas increment will measure the incremental rainfall in shorter periods. Since our period is "daily", we're showing the incremental daily rain, which equals the accumulated rain total for that day.
We will also drop streamflow from the dataset as this is the value we aim to predict in future ML applications.
drop_cols = ['TMAX', 'TMIN', 'SNWD', 'PRCP', 'streamflow']
for c in drop_cols:
for col in df.columns:
if fnmatch.fnmatch(col, f"*{c}*"):
df = df.drop(columns=[col])
display(df)
df.to_csv('data/streamflow_prediction_dataset_dropped_cols.csv')
sns.heatmap(df.corr())
plt.show()
WTEQ_BisonLake | WTEQ_McClurePass | PREC_BisonLake | PREC_McClurePass | TAVG_BisonLake | TAVG_McClurePass | soilmoisture_station378_2ft | soilmoisture_station378_8ft | soilmoisture_station378_20ft | soilmoisture_station457_2ft | ... | soilmoisture_station457_20ft | soilmoisture_station607_4ft | soilmoisture_station607_8ft | soilmoisture_station607_20ft | soilmoisture_station680_2ft | soilmoisture_station680_8ft | soilmoisture_station680_20ft | soilmoisture_station802_2ft | soilmoisture_station802_8ft | soilmoisture_station802_20ft | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2008-03-12 | 28.6 | 23.9 | 29.1 | 22.9 | 19.4 | 30.2 | 32.2 | 35.4 | 23.9 | 18.7 | ... | 20.9 | 10.2 | 24.2 | 25.8 | 14.6 | 11.7 | 26.0 | 4.0 | 1.3 | 6.5 |
2008-03-15 | 29.2 | 24.5 | 29.7 | 23.4 | 11.7 | 23.4 | 32.1 | 35.6 | 24.1 | 18.7 | ... | 20.9 | 10.3 | 24.4 | 25.4 | 14.7 | 11.7 | 26.2 | 4.0 | 1.4 | 6.9 |
2008-03-17 | 29.3 | 24.8 | 29.8 | 23.6 | 14.4 | 24.3 | 32.2 | 35.6 | 24.2 | 19.0 | ... | 21.0 | 10.3 | 24.7 | 25.2 | 14.4 | 12.1 | 26.1 | 4.1 | 1.4 | 6.7 |
2008-03-18 | 29.3 | 24.8 | 29.8 | 23.6 | 11.8 | 23.9 | 32.2 | 35.6 | 24.2 | 19.0 | ... | 21.2 | 10.6 | 24.9 | 25.3 | 14.7 | 12.1 | 26.0 | 4.0 | 1.4 | 6.6 |
2008-03-19 | 29.4 | 24.9 | 29.8 | 23.6 | 20.8 | 30.2 | 32.2 | 35.6 | 24.2 | 18.9 | ... | 21.1 | 10.6 | 25.0 | 25.1 | 14.6 | 11.7 | 25.9 | 4.0 | 1.6 | 6.9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-07-23 | 0.0 | 0.0 | 27.7 | 20.7 | 52.9 | 62.1 | 21.2 | 17.1 | 18.4 | 4.2 | ... | 1.5 | 13.8 | 14.5 | 11.5 | 35.5 | 14.2 | 30.2 | 24.1 | 5.5 | 9.2 |
2021-07-24 | 0.0 | 0.0 | 27.7 | 21.1 | 54.7 | 57.0 | 19.6 | 17.0 | 18.4 | 3.8 | ... | 1.6 | 13.4 | 14.1 | 11.3 | 33.7 | 13.9 | 30.2 | 24.0 | 5.3 | 8.7 |
2021-07-25 | 0.0 | 0.0 | 27.8 | 21.5 | 52.7 | 57.6 | 19.7 | 16.8 | 18.4 | 3.9 | ... | 1.4 | 13.1 | 13.9 | 10.8 | 39.8 | 13.8 | 30.2 | 23.8 | 5.0 | 8.1 |
2021-07-26 | 0.0 | 0.0 | 27.8 | 21.5 | 57.0 | 61.2 | 25.2 | 16.8 | 18.4 | 4.1 | ... | 1.2 | 12.9 | 13.9 | 10.4 | 37.7 | 14.1 | 30.2 | 23.7 | 4.8 | 7.0 |
2021-07-27 | 0.0 | 0.0 | 27.8 | 21.5 | 58.5 | 64.8 | 22.4 | 16.8 | 18.2 | 3.9 | ... | 1.2 | 12.4 | 13.6 | 10.1 | 34.0 | 13.9 | 30.1 | 23.9 | 4.4 | 6.1 |
2996 rows × 21 columns
scaler = StandardScaler()
scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled, columns=df.columns, index=df.index)
display(df_scaled)
WTEQ_BisonLake | WTEQ_McClurePass | PREC_BisonLake | PREC_McClurePass | TAVG_BisonLake | TAVG_McClurePass | soilmoisture_station378_2ft | soilmoisture_station378_8ft | soilmoisture_station378_20ft | soilmoisture_station457_2ft | ... | soilmoisture_station457_20ft | soilmoisture_station607_4ft | soilmoisture_station607_8ft | soilmoisture_station607_20ft | soilmoisture_station680_2ft | soilmoisture_station680_8ft | soilmoisture_station680_20ft | soilmoisture_station802_2ft | soilmoisture_station802_8ft | soilmoisture_station802_20ft | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2008-03-12 | 1.883552 | 3.196112 | 0.342990 | 0.509263 | -0.814967 | -0.741420 | 0.740597 | 0.920122 | 0.648393 | 0.475802 | ... | 0.193986 | -1.005935 | -0.005907 | -0.087666 | -1.006524 | -0.508549 | -0.749191 | -1.060531 | -1.166220 | -0.465500 |
2008-03-15 | 1.943418 | 3.293490 | 0.385416 | 0.557196 | -1.274531 | -1.161009 | 0.727894 | 0.944797 | 0.677460 | 0.475802 | ... | 0.193986 | -0.996075 | 0.015597 | -0.087700 | -0.993916 | -0.508549 | -0.707242 | -1.060531 | -1.146229 | -0.405506 |
2008-03-17 | 1.953396 | 3.342180 | 0.392487 | 0.576370 | -1.113385 | -1.105475 | 0.740597 | 0.944797 | 0.691994 | 0.513427 | ... | 0.200775 | -0.996075 | 0.047854 | -0.087718 | -1.031742 | -0.444300 | -0.728216 | -1.048451 | -1.146229 | -0.435503 |
2008-03-18 | 1.953396 | 3.342180 | 0.392487 | 0.576370 | -1.268562 | -1.130157 | 0.740597 | 0.944797 | 0.691994 | 0.513427 | ... | 0.214353 | -0.966494 | 0.069358 | -0.087709 | -0.993916 | -0.444300 | -0.749191 | -1.060531 | -1.146229 | -0.450502 |
2008-03-19 | 1.963373 | 3.358410 | 0.392487 | 0.576370 | -0.731410 | -0.741420 | 0.740597 | 0.944797 | 0.691994 | 0.500885 | ... | 0.207564 | -0.966494 | 0.080110 | -0.087726 | -1.006524 | -0.508549 | -0.770166 | -1.060531 | -1.106247 | -0.405506 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-07-23 | -0.970057 | -0.682804 | 0.243995 | 0.298355 | 1.184432 | 1.226948 | -0.656634 | -1.337656 | -0.150959 | -1.342745 | ... | -1.123074 | -0.650968 | -1.048870 | -0.088893 | 1.628705 | -0.106994 | 0.131751 | 1.367593 | -0.326606 | -0.060535 |
2021-07-24 | -0.970057 | -0.682804 | 0.243995 | 0.336702 | 1.291862 | 0.912256 | -0.859867 | -1.349994 | -0.150959 | -1.392912 | ... | -1.116285 | -0.690409 | -1.091878 | -0.088910 | 1.401747 | -0.155181 | 0.131751 | 1.355512 | -0.366587 | -0.135529 |
2021-07-25 | -0.970057 | -0.682804 | 0.251066 | 0.375049 | 1.172495 | 0.949278 | -0.847165 | -1.374669 | -0.150959 | -1.380371 | ... | -1.129863 | -0.719989 | -1.113383 | -0.088953 | 2.170881 | -0.171243 | 0.131751 | 1.331352 | -0.426560 | -0.225521 |
2021-07-26 | -0.970057 | -0.682804 | 0.251066 | 0.375049 | 1.429134 | 1.171414 | -0.148550 | -1.374669 | -0.150959 | -1.355287 | ... | -1.143441 | -0.739710 | -1.113383 | -0.088987 | 1.906097 | -0.123056 | 0.131751 | 1.319272 | -0.466541 | -0.390507 |
2021-07-27 | -0.970057 | -0.682804 | 0.251066 | 0.375049 | 1.518659 | 1.393549 | -0.504209 | -1.374669 | -0.180026 | -1.380371 | ... | -1.143441 | -0.789011 | -1.145639 | -0.089013 | 1.439574 | -0.155181 | 0.110776 | 1.343432 | -0.546504 | -0.525495 |
2996 rows × 21 columns
3 Components¶
# PCA
components = 3
pca = PCA(n_components=components)
pca.fit(df)
df_pca = pd.DataFrame(
pca.transform(df),
columns=[f'PC{n+1}' for n in range(0,components)],
index=df.index
)
display(df_pca)
print(f"Explained Variance with {components} components: {pca.explained_variance_ratio_.sum()*100:.2f}%")
PC1 | PC2 | PC3 | |
---|---|---|---|
date | |||
2008-03-12 | -1021.971054 | 22.610200 | 4.247154 |
2008-03-15 | -1022.371487 | 30.408159 | 3.839125 |
2008-03-17 | -1022.571249 | 28.511002 | 4.505413 |
2008-03-18 | -1022.471308 | 30.214762 | 4.536593 |
2008-03-19 | -1022.670723 | 21.842089 | 5.726391 |
... | ... | ... | ... |
2021-07-23 | -1036.271276 | -36.857547 | -12.463828 |
2021-07-24 | -1036.471823 | -35.275929 | -14.026421 |
2021-07-25 | -1036.971419 | -35.283413 | -13.384042 |
2021-07-26 | -1037.370909 | -38.701855 | -11.943589 |
2021-07-27 | -1037.671318 | -41.436215 | -13.434326 |
2996 rows × 3 columns
Explained Variance with 3 components: 100.00%
# 3D plot
fig = px.scatter_3d(
df_pca,
x='PC1',
y='PC2',
z='PC3',
title='PCA'
)
fig.show("png")
# Scree plot
plt.plot(pca.explained_variance_ratio_)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.xticks(range(0, components))
plt.ylabel('Explained Variance')
plt.show()
sns.heatmap(df_pca.corr().apply(lambda x: round(x,8)), annot=True)
<Axes: >
Proof of no correlation
# Loadings
loadings = pca.components_.T
df_loadings = pd.DataFrame(
loadings,
columns = [f'PC{n+1}' for n in range(0,components)],
index=df.columns
)
display(df_loadings)
PC1 | PC2 | PC3 | |
---|---|---|---|
WTEQ_BisonLake | 0.000079 | 0.175965 | 0.260975 |
WTEQ_McClurePass | -0.000023 | 0.118754 | 0.044725 |
PREC_BisonLake | 0.000034 | -0.346831 | 0.268762 |
PREC_McClurePass | 0.000025 | -0.261255 | 0.195453 |
TAVG_BisonLake | 0.000038 | -0.558725 | 0.067593 |
TAVG_McClurePass | 0.000043 | -0.535658 | 0.095225 |
soilmoisture_station378_2ft | 0.000061 | 0.122124 | 0.255191 |
soilmoisture_station378_8ft | 0.000071 | 0.168061 | 0.253477 |
soilmoisture_station378_20ft | 0.000076 | 0.055520 | 0.235689 |
soilmoisture_station457_2ft | 0.000080 | 0.165979 | 0.247819 |
soilmoisture_station457_8ft | 0.000058 | 0.050228 | 0.117303 |
soilmoisture_station457_20ft | -0.000003 | 0.238684 | 0.379029 |
soilmoisture_station607_4ft | 0.000081 | -0.018007 | 0.323086 |
soilmoisture_station607_8ft | 0.000106 | 0.060923 | 0.373157 |
soilmoisture_station607_20ft | 1.000000 | 0.000029 | -0.000237 |
soilmoisture_station680_2ft | 0.000083 | -0.107431 | 0.170617 |
soilmoisture_station680_8ft | 0.000098 | -0.058145 | 0.209255 |
soilmoisture_station680_20ft | 0.000064 | -0.071565 | 0.112727 |
soilmoisture_station802_2ft | 0.000016 | -0.094705 | 0.157722 |
soilmoisture_station802_8ft | 0.000072 | -0.010157 | 0.161957 |
soilmoisture_station802_20ft | -0.000014 | -0.029139 | 0.147373 |
2 Components¶
# PCA
components = 2
pca = PCA(n_components=components)
pca.fit(df)
df_pca = pd.DataFrame(
pca.transform(df),
columns=[f'PC{n+1}' for n in range(0,components)],
index=df.index
)
display(df_pca)
print(f"Explained Variance with {components} components: {pca.explained_variance_ratio_.sum()*100:.2f}%")
PC1 | PC2 | |
---|---|---|
date | ||
2008-03-12 | -1021.971054 | 22.610200 |
2008-03-15 | -1022.371487 | 30.408159 |
2008-03-17 | -1022.571249 | 28.511002 |
2008-03-18 | -1022.471308 | 30.214762 |
2008-03-19 | -1022.670723 | 21.842089 |
... | ... | ... |
2021-07-23 | -1036.271276 | -36.857547 |
2021-07-24 | -1036.471823 | -35.275929 |
2021-07-25 | -1036.971419 | -35.283413 |
2021-07-26 | -1037.370909 | -38.701855 |
2021-07-27 | -1037.671318 | -41.436215 |
2996 rows × 2 columns
Explained Variance with 2 components: 100.00%
# 2D plot
fig = px.scatter(
df_pca,
x='PC1',
y='PC2',
title='PCA'
)
fig.show()