import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly.express as px
from IPython.display import Image
KMeans: All Features
df = pd.read_csv('data/streamflow_prediction_dataset_averaged_cols.csv')
df = df.set_index('date')
display(df)
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(df)
df = pd.DataFrame(X, columns=df.columns)
display(df)
| date | WTEQ_BisonLake | WTEQ_McClurePass | PREC_Avg | TAVG_Avg | soilmoisture_Avg_2ft | soilmoisture_Avg_4ft | soilmoisture_Avg_8ft | soilmoisture_Avg_20ft |
|---|---|---|---|---|---|---|---|---|
| 2008-03-12 | 28.6 | 23.9 | 26.00 | 24.80 | 17.375 | 10.2 | 17.74 | 20.62 |
| 2008-03-15 | 29.2 | 24.5 | 26.55 | 17.55 | 17.375 | 10.3 | 17.88 | 20.70 |
| 2008-03-17 | 29.3 | 24.8 | 26.70 | 19.35 | 17.425 | 10.3 | 18.04 | 20.64 |
| 2008-03-18 | 29.3 | 24.8 | 26.70 | 17.85 | 17.475 | 10.6 | 18.06 | 20.66 |
| 2008-03-19 | 29.4 | 24.9 | 26.70 | 25.50 | 17.425 | 10.6 | 18.06 | 20.64 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2021-07-23 | 0.0 | 0.0 | 24.20 | 57.50 | 21.250 | 13.8 | 14.60 | 14.16 |
| 2021-07-24 | 0.0 | 0.0 | 24.40 | 55.85 | 20.275 | 13.4 | 14.38 | 14.04 |
| 2021-07-25 | 0.0 | 0.0 | 24.65 | 55.15 | 21.800 | 13.1 | 14.24 | 13.78 |
| 2021-07-26 | 0.0 | 0.0 | 24.65 | 59.10 | 22.675 | 12.9 | 14.26 | 13.44 |
| 2021-07-27 | 0.0 | 0.0 | 24.65 | 61.65 | 21.050 | 12.4 | 14.08 | 13.14 |

2996 rows × 8 columns
|  | WTEQ_BisonLake | WTEQ_McClurePass | PREC_Avg | TAVG_Avg | soilmoisture_Avg_2ft | soilmoisture_Avg_4ft | soilmoisture_Avg_8ft | soilmoisture_Avg_20ft |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.883552 | 3.196112 | 0.415239 | -0.780671 | -0.320641 | -1.005935 | -0.061222 | -0.087600 |
| 1 | 1.943418 | 3.293490 | 0.460183 | -1.221634 | -0.320641 | -0.996075 | -0.034319 | -0.087566 |
| 2 | 1.953396 | 3.342180 | 0.472441 | -1.112154 | -0.311656 | -0.996075 | -0.003573 | -0.087591 |
| 3 | 1.953396 | 3.342180 | 0.472441 | -1.203387 | -0.302671 | -0.966494 | 0.000271 | -0.087583 |
| 4 | 1.963373 | 3.358410 | 0.472441 | -0.738096 | -0.311656 | -0.966494 | 0.000271 | -0.087591 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2991 | -0.970057 | -0.682804 | 0.268148 | 1.208223 | 0.375667 | -0.650968 | -0.664616 | -0.090370 |
| 2992 | -0.970057 | -0.682804 | 0.284491 | 1.107866 | 0.200467 | -0.690409 | -0.706892 | -0.090422 |
| 2993 | -0.970057 | -0.682804 | 0.304920 | 1.065290 | 0.474498 | -0.719989 | -0.733795 | -0.090533 |
| 2994 | -0.970057 | -0.682804 | 0.304920 | 1.305539 | 0.631728 | -0.739710 | -0.729952 | -0.090679 |
| 2995 | -0.970057 | -0.682804 | 0.304920 | 1.460636 | 0.339728 | -0.789011 | -0.764541 | -0.090808 |

2996 rows × 8 columns
def silhouette_analysis(X, range_n_clusters):
for n_clusters in range_n_clusters:
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X)
silhouette_avg = silhouette_score(X, cluster_labels)
print(f"For n_clusters = {n_clusters}, the average silhouette_score "
f"is : {silhouette_avg}")
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
ith_cluster_silhouette_values = sample_silhouette_values[
cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
ith_cluster_silhouette_values, facecolor=color,
edgecolor=color, alpha=0.7)
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd Plot showing the actual clusters formed
# Change the feature indices here to plot different features
feature_x = 2 # Index of the feature for the x-axis
feature_y = 3 # Index of the feature for the y-axis
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, feature_x], X[:, feature_y], marker=".", s=30, lw=0, alpha=0.7,
c=colors, edgecolor="k")
centers = kmeans.cluster_centers_
ax2.scatter(centers[:, feature_x], centers[:, feature_y], marker="o", c="white",
alpha=1, s=200, edgecolor="k")
for i, c in enumerate(centers):
ax2.scatter(c[feature_x], c[feature_y], marker=f"${i}$", alpha=1, s=50,
edgecolor="k")
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel(f"Feature space for the {feature_x + 1} feature")
ax2.set_ylabel(f"Feature space for the {feature_y + 1} feature")
plt.suptitle((f"Silhouette analysis for KMeans clustering on sample "
f"data with n_clusters = {n_clusters}"),
fontsize=14, fontweight="bold")
plt.show()
range_n_clusters = [2, 3, 4, 5, 6]
silhouette_analysis(X, range_n_clusters)
For n_clusters = 2, the average silhouette_score is : 0.278979440255955
For n_clusters = 3, the average silhouette_score is : 0.30947333950013606
For n_clusters = 4, the average silhouette_score is : 0.316245405519837
For n_clusters = 5, the average silhouette_score is : 0.3307353088244625
For n_clusters = 6, the average silhouette_score is : 0.35581225527993837
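If you only need the number rather than the plots, a minimal sketch of turning these scores into a choice of k is shown below; the helper name `best_k_by_silhouette` is hypothetical and is not part of the original notebook, but it reuses the same `KMeans` settings and `silhouette_score` call as `silhouette_analysis` above.

```python
from sklearn.cluster import KMeans            # already imported above
from sklearn.metrics import silhouette_score  # already imported above

def best_k_by_silhouette(X, candidate_ks):
    """Return the candidate k with the highest average silhouette score."""
    scores = {}
    for k in candidate_ks:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
        scores[k] = silhouette_score(X, labels)
    # Pick the k whose stored silhouette score is largest
    return max(scores, key=scores.get), scores

# Example usage on the standardized feature matrix X from above:
# best_k, all_scores = best_k_by_silhouette(X, [2, 3, 4, 5, 6])
```

On this run the highest average silhouette falls at n_clusters = 6, so a helper like this would return 6; the pairplot below is drawn with 4 clusters.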
def plot_clusters(df, n_clusters, title='Clusters'):
    # Fit KMeans and attach the predicted cluster label to the frame
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    df['cluster'] = kmeans.fit_predict(df)
    # Pairwise scatter plots of every feature pair, colored by cluster
    sns.pairplot(df, hue='cluster')
    plt.suptitle(title, y=1.02)
    plt.show()
plot_clusters(df, 4)
del df
KMeans: Snow Water Equivalent, Precipitation, and Temperature
My initial cleaning steps pivoted the dataset so that each station_value_[height depth] combination became its own feature. With KMeans, however, I would like to retain the station labels to see whether interesting patterns emerge between individual stations, so I use the original datasets for the clustering below.
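To make the layout difference concrete, here is a minimal sketch (not part of the original notebook) contrasting the pivoted wide format from the earlier cleaning with the long format kept here. The column names mirror the frame loaded below, the numbers are placeholder values, and the `pivot_table` call is illustrative rather than the exact cleaning code.

```python
import pandas as pd

# Long format, as used below: one row per (date, Station) with WTEQ/PREC/TAVG columns,
# so the Station label survives and can be inspected after clustering.
# The values here are made-up placeholders for illustration only.
long_df = pd.DataFrame({
    'date': ['2020-01-01', '2020-01-01'],
    'Station': ['Station 345 - Bison Lake', 'Station 618 - McClure Pass'],
    'WTEQ': [10.1, 8.4],
    'PREC': [20.0, 15.2],
    'TAVG': [25.0, 27.5],
})

# Wide (pivoted) format: one column per measurement/station pair
# (e.g. WTEQ_BisonLake, WTEQ_McClurePass), which drops the per-row Station label.
wide_df = long_df.pivot_table(index='date', columns='Station',
                              values=['WTEQ', 'PREC', 'TAVG'])
wide_df.columns = [f"{measure}_{station}" for measure, station in wide_df.columns]
```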
df_swe_temp = pd.read_parquet('data/nrcs_snow_water_equivalent_and_temp.parquet')
df_swe_temp = df_swe_temp.set_index(['date','Station'])
df_swe_temp = df_swe_temp.drop(columns=['SNWD', 'TMAX', 'TMIN', 'PRCP'])
display(df_swe_temp)
| date | Station | WTEQ | PREC | TAVG |
|---|---|---|---|---|
| 1997-08-01 | Station 345 - Bison Lake | 0.0 | 48.2 | 50.5 |
| 1997-08-02 | Station 345 - Bison Lake | 0.0 | 48.2 | 55.0 |
| 1997-08-03 | Station 345 - Bison Lake | 0.0 | 48.5 | 54.9 |
| 1997-08-04 | Station 345 - Bison Lake | 0.0 | 48.6 | 50.0 |
| 1997-08-05 | Station 345 - Bison Lake | 0.0 | 48.8 | 46.0 |
| ... | ... | ... | ... | ... |
| 2024-09-02 | Station 618 - McClure Pass | 0.0 | 32.4 | 58.1 |
| 2024-09-03 | Station 345 - Bison Lake | 0.0 | 36.9 | 52.3 |
| 2024-09-03 | Station 618 - McClure Pass | 0.0 | 32.4 | 59.0 |
| 2024-09-04 | Station 345 - Bison Lake | 0.0 | 37.1 | 45.5 |
| 2024-09-05 | Station 345 - Bison Lake | 0.0 | 37.1 | 43.7 |

16245 rows × 3 columns
# Perform elbow method analysis
def elbow_method(X, range_n_clusters):
distortions = []
for n_clusters in range_n_clusters:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X)
distortions.append(kmeans.inertia_)
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.title('Elbow Method For Optimal Number of Clusters')
plt.show()
range_n_clusters = range(1, 11)
X = scaler.fit_transform(df_swe_temp.values)
elbow_method(X, range_n_clusters)
# Fit KMeans with 4 clusters
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(df_swe_temp)
df_swe_temp['cluster'] = kmeans.predict(df_swe_temp)
df_swe_temp = df_swe_temp.reset_index()
# Plot the clusters
fig = px.scatter_3d(
df_swe_temp,
x='WTEQ',
y='PREC',
z='TAVG',
color='cluster',
hover_data=['date', 'Station']
)
fig.show()
Image(filename='img/swe_clusters.png')
def assign_meteorological_season(date):
month = date.month
day = date.day
if month in [3, 4, 5]:
return 'spring'
elif month in [6, 7, 8]:
return 'summer'
elif month in [9, 10, 11]:
return 'fall'
elif month in [12, 1, 2]:
return 'winter'
else:
return 'unknown'
df_swe_temp['season'] = df_swe_temp['date'].apply(assign_meteorological_season)
px.scatter_3d(
df_swe_temp,
x='WTEQ',
y='PREC',
z='TAVG',
color='season',
hover_data=['date', 'Station', 'cluster']
)
Image(filename='img/swe_seasons.png')
# Fit KMeans with 4 clusters (if not already fitted)
if 'cluster' not in df_swe_temp.columns:
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(df_swe_temp[['WTEQ', 'PREC', 'TAVG']])
df_swe_temp['cluster'] = kmeans.predict(df_swe_temp[['WTEQ', 'PREC', 'TAVG']])
df_swe_temp = df_swe_temp.reset_index()
# Create interactive html with centroids
fig = px.scatter_3d(
df_swe_temp,
x='WTEQ',
y='PREC',
z='TAVG',
color='cluster',
hover_data=['date', 'Station', 'cluster', 'season']
)
# Add centroids to the plot
centroids = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids, columns=['WTEQ', 'PREC', 'TAVG'])
centroid_df['cluster'] = range(4)
fig.add_trace(
px.scatter_3d(
centroid_df,
x='WTEQ',
y='PREC',
z='TAVG',
color='cluster'
).update_traces(marker=dict(symbol='x', size=12, color='black', opacity=1)).data[0]
)
fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))
fig.write_html('img/swe_clusters_with_centroids.html')
fig.show()
Conclusions
The seasonal relationship between temperature, snow water equivalent (WTEQ), and precipitation becomes visible with the KMeans clustering algorithm. Cluster 1 tends to correspond to the winter months, where temperatures are low and precipitation has increased, driving WTEQ higher. Cluster 2 is generally a mix of hotter spring and summer conditions with high precipitation but low WTEQ. Cluster 3 consists mostly of the spring season, when TAVG is cooler and PREC and WTEQ are at their peaks. Cluster 4 generally indicates the fall season, where temperatures span a wide range but WTEQ and PREC are at their lows.
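These cluster-to-season readings come from visual inspection of the 3-D plots; a quick way to back them up numerically is a cross-tabulation of the fitted cluster labels against the season column created above. A minimal sketch, assuming `df_swe_temp` still holds both the `cluster` and `season` columns:

```python
import pandas as pd

# Count how many observations of each season fall into each KMeans cluster.
# Rows: cluster label (0-3 as assigned by KMeans); columns: meteorological season.
season_by_cluster = pd.crosstab(df_swe_temp['cluster'], df_swe_temp['season'])
print(season_by_cluster)

# Normalizing by row shows the seasonal mix within each cluster as proportions.
print(pd.crosstab(df_swe_temp['cluster'], df_swe_temp['season'],
                  normalize='index').round(2))
```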