In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [2]:
df = pd.read_csv('data/streamflow_prediction_dataset_dropped_cols.csv')
display(df)
date WTEQ_BisonLake WTEQ_McClurePass PREC_BisonLake PREC_McClurePass TAVG_BisonLake TAVG_McClurePass soilmoisture_station378_2ft soilmoisture_station378_8ft soilmoisture_station378_20ft ... soilmoisture_station457_20ft soilmoisture_station607_4ft soilmoisture_station607_8ft soilmoisture_station607_20ft soilmoisture_station680_2ft soilmoisture_station680_8ft soilmoisture_station680_20ft soilmoisture_station802_2ft soilmoisture_station802_8ft soilmoisture_station802_20ft
0 2008-03-12 28.6 23.9 29.1 22.9 19.4 30.2 32.2 35.4 23.9 ... 20.9 10.2 24.2 25.8 14.6 11.7 26.0 4.0 1.3 6.5
1 2008-03-15 29.2 24.5 29.7 23.4 11.7 23.4 32.1 35.6 24.1 ... 20.9 10.3 24.4 25.4 14.7 11.7 26.2 4.0 1.4 6.9
2 2008-03-17 29.3 24.8 29.8 23.6 14.4 24.3 32.2 35.6 24.2 ... 21.0 10.3 24.7 25.2 14.4 12.1 26.1 4.1 1.4 6.7
3 2008-03-18 29.3 24.8 29.8 23.6 11.8 23.9 32.2 35.6 24.2 ... 21.2 10.6 24.9 25.3 14.7 12.1 26.0 4.0 1.4 6.6
4 2008-03-19 29.4 24.9 29.8 23.6 20.8 30.2 32.2 35.6 24.2 ... 21.1 10.6 25.0 25.1 14.6 11.7 25.9 4.0 1.6 6.9
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2991 2021-07-23 0.0 0.0 27.7 20.7 52.9 62.1 21.2 17.1 18.4 ... 1.5 13.8 14.5 11.5 35.5 14.2 30.2 24.1 5.5 9.2
2992 2021-07-24 0.0 0.0 27.7 21.1 54.7 57.0 19.6 17.0 18.4 ... 1.6 13.4 14.1 11.3 33.7 13.9 30.2 24.0 5.3 8.7
2993 2021-07-25 0.0 0.0 27.8 21.5 52.7 57.6 19.7 16.8 18.4 ... 1.4 13.1 13.9 10.8 39.8 13.8 30.2 23.8 5.0 8.1
2994 2021-07-26 0.0 0.0 27.8 21.5 57.0 61.2 25.2 16.8 18.4 ... 1.2 12.9 13.9 10.4 37.7 14.1 30.2 23.7 4.8 7.0
2995 2021-07-27 0.0 0.0 27.8 21.5 58.5 64.8 22.4 16.8 18.2 ... 1.2 12.4 13.6 10.1 34.0 13.9 30.1 23.9 4.4 6.1

2996 rows × 22 columns

In [3]:
# Identifying and averaging columns with "PREC" and "TAVG"
prec_columns = [col for col in df.columns if "PREC" in col]
tavg_columns = [col for col in df.columns if "TAVG" in col]

# Averaging the PREC and TAVG columns
df['PREC_Avg'] = df[prec_columns].mean(axis=1)
df['TAVG_Avg'] = df[tavg_columns].mean(axis=1)

# Identifying soil moisture columns by depth and averaging them dynamically
soilmoisture_depths = ['2ft', '4ft', '8ft', '20ft']
for depth in soilmoisture_depths:
    soilmoisture_columns = [col for col in df.columns if f"soilmoisture" in col and depth in col]
    if soilmoisture_columns:
        df[f'soilmoisture_Avg_{depth}'] = df[soilmoisture_columns].mean(axis=1)

# Dropping only the original columns used in the averaging, not the new averaged columns
columns_to_drop = prec_columns + \
    tavg_columns + \
        [col for col in df.columns if "soilmoisture" in col and col not in df.columns[-len(soilmoisture_depths):]]
df_final = df.drop(columns=columns_to_drop)

df_final = df_final.set_index('date')
df_final.to_csv('data/streamflow_prediction_dataset_averaged_cols.csv')
display(df_final)
WTEQ_BisonLake WTEQ_McClurePass PREC_Avg TAVG_Avg soilmoisture_Avg_2ft soilmoisture_Avg_4ft soilmoisture_Avg_8ft soilmoisture_Avg_20ft
date
2008-03-12 28.6 23.9 26.00 24.80 17.375 10.2 17.74 20.62
2008-03-15 29.2 24.5 26.55 17.55 17.375 10.3 17.88 20.70
2008-03-17 29.3 24.8 26.70 19.35 17.425 10.3 18.04 20.64
2008-03-18 29.3 24.8 26.70 17.85 17.475 10.6 18.06 20.66
2008-03-19 29.4 24.9 26.70 25.50 17.425 10.6 18.06 20.64
... ... ... ... ... ... ... ... ...
2021-07-23 0.0 0.0 24.20 57.50 21.250 13.8 14.60 14.16
2021-07-24 0.0 0.0 24.40 55.85 20.275 13.4 14.38 14.04
2021-07-25 0.0 0.0 24.65 55.15 21.800 13.1 14.24 13.78
2021-07-26 0.0 0.0 24.65 59.10 22.675 12.9 14.26 13.44
2021-07-27 0.0 0.0 24.65 61.65 21.050 12.4 14.08 13.14

2996 rows × 8 columns

In [4]:
# Corr plot
sns.heatmap(df_final.corr(), annot=True)
plt.show()
No description has been provided for this image
In [5]:
# PCA 3 Components
scaler = StandardScaler()
scaler.fit(df_final)
scaled_data = scaler.transform(df_final)

pca = PCA(n_components=3)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum()}")
Explained variance ratio: 0.7410008019014971
In [6]:
x_pca_df = pd.DataFrame(x_pca, columns=['PC1', 'PC2', 'PC3'])
x_pca_df.to_csv('data/streamflow_prediction_dataset_pca.csv')
In [7]:
# Loadings
loadings = pd.DataFrame(
    pca.components_.T, 
    columns=['PC1', 'PC2', 'PC3'], 
    index=df_final.columns
)

display(loadings)
PC1 PC2 PC3
WTEQ_BisonLake 0.497955 -0.183066 -0.328210
WTEQ_McClurePass 0.352926 -0.394528 -0.435644
PREC_Avg -0.052566 0.441095 -0.665005
TAVG_Avg -0.277844 0.534101 -0.209983
soilmoisture_Avg_2ft 0.436094 0.294634 0.169407
soilmoisture_Avg_4ft 0.315327 0.386690 0.049965
soilmoisture_Avg_8ft 0.495560 0.274834 0.183762
soilmoisture_Avg_20ft 0.111063 0.138321 0.388850
In [8]:
# 3D Plot
fig = px.scatter_3d(
    x=x_pca[:,0], 
    y=x_pca[:,1], 
    z=x_pca[:,2], 
    labels={
        'x': 'PC1', 
        'y': 'PC2', 
        'z': 'PC3'
    }
)

fig.show("png")
No description has been provided for this image
In [9]:
# PCA 2 Components
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum()}")
Explained variance ratio: 0.6084890287448601
In [10]:
# 2D Plot
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1])
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
No description has been provided for this image

How many components are needed to attain at least 95% of the variance of this dataset?

In [11]:
# PCA >95% explained variance ratio
for i in range(1, len(df_final.columns)):
    pca = PCA(n_components=i)
    pca.fit(scaled_data)
    if pca.explained_variance_ratio_.sum() > 0.95:
        print(f"Number of components: {i}")
        break
    
pca = PCA(n_components=i)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum()}")
display(x_pca)
Number of components: 6
Explained variance ratio: 0.9601600406294081
array([[ 1.76389574, -2.35196846, -2.27267027,  1.03494896,  0.25632147,
         0.47177285],
       [ 1.96467502, -2.60582904, -2.26658516,  1.03564982,  0.17733779,
         0.3367471 ],
       [ 1.97491624, -2.55189082, -2.31504949,  1.04876267,  0.19527908,
         0.38190608],
       ...,
       [-1.42983787,  0.79763789,  0.06373772, -0.04160527,  0.83965854,
         0.71349838],
       [-1.43235217,  0.96569096,  0.0395898 , -0.04953814,  0.97037452,
         0.87139566],
       [-1.63548575,  0.93390702, -0.05131443,  0.02569397,  0.88502306,
         0.77966624]])
In [12]:
# Top 3 eigenvalues
eigenvalues = pca.explained_variance_
eigenvalues = eigenvalues / eigenvalues.sum()
top_3_eigenvalues = eigenvalues[:3]
display(top_3_eigenvalues)
array([0.36065468, 0.2730824 , 0.13801009])