In [2]:
import pandas as pd
from IPython.display import Markdown
In [3]:
# Import data
df_swe_and_temp = pd.read_parquet('data/nrcs_swe_pivot.parquet')
df_soil_moisture = pd.read_parquet('data/nrcs_soil_moisture_pivot.parquet')
df_streamflow = pd.read_parquet('data/usgs_colorado_river_glenwood_springs.parquet')

df_swe_and_temp.index = pd.to_datetime(df_swe_and_temp.index)
df_soil_moisture.index = pd.to_datetime(df_soil_moisture.index)
df_streamflow.index = pd.to_datetime(df_streamflow.index)

dfs = {
    'Snow Water Equivalent / Temperature': df_swe_and_temp,
    'Soil Moisture': df_soil_moisture,
    'Streamflow': df_streamflow
}
In [4]:
# Dislay metadata
for name, df in dfs.items():
    header = f"### {name}"
    rows = f"**Rows:** {df.shape[0]:,}"
    unique_dates = f"**Unique dates:** {df.index.nunique():,}"
    uniqueness = f"**Date uniqueness:** {df.shape[0] / df.index.nunique():.2f} rows per date"
    columns = f"**Column names:** {', '.join(df.columns)}"
    date_range = f"""**Date range:** 
    {df.index.min().strftime('%Y-%m-%d')} to 
    {df.index.max().strftime('%Y-%m-%d')}"""
    markdown_content = f"{header}\n\n{rows}\n\n{unique_dates}\n\n{uniqueness}\n\n{columns}\n\n{date_range}"
    display(Markdown(markdown_content))

Snow Water Equivalent / Temperature¶

Rows: 6,492

Unique dates: 6,492

Date uniqueness: 1.00 rows per date

Column names: SNWD_BisonLake, SNWD_McClurePass, WTEQ_BisonLake, WTEQ_McClurePass, PREC_BisonLake, PREC_McClurePass, PRCP_BisonLake, PRCP_McClurePass, TAVG_BisonLake, TAVG_McClurePass, TMAX_BisonLake, TMAX_McClurePass, TMIN_BisonLake, TMIN_McClurePass

Date range: 2003-06-25 to 2024-09-03

Soil Moisture¶

Rows: 3,503

Unique dates: 3,503

Date uniqueness: 1.00 rows per date

Column names: soilmoisture_station378_2ft, soilmoisture_station378_8ft, soilmoisture_station378_20ft, soilmoisture_station457_2ft, soilmoisture_station457_8ft, soilmoisture_station457_20ft, soilmoisture_station607_4ft, soilmoisture_station607_8ft, soilmoisture_station607_20ft, soilmoisture_station680_2ft, soilmoisture_station680_8ft, soilmoisture_station680_20ft, soilmoisture_station802_2ft, soilmoisture_station802_8ft, soilmoisture_station802_20ft

Date range: 2008-03-12 to 2021-07-27

Streamflow¶

Rows: 8,792

Unique dates: 8,792

Date uniqueness: 1.00 rows per date

Column names: streamflow

Date range: 2000-01-01 to 2024-01-26

In [5]:
df_joined = pd.concat(dfs, axis=1, join='inner')
df_joined = df_joined.droplevel(0, axis=1)
display(df_joined)
SNWD_BisonLake SNWD_McClurePass WTEQ_BisonLake WTEQ_McClurePass PREC_BisonLake PREC_McClurePass PRCP_BisonLake PRCP_McClurePass TAVG_BisonLake TAVG_McClurePass ... soilmoisture_station607_4ft soilmoisture_station607_8ft soilmoisture_station607_20ft soilmoisture_station680_2ft soilmoisture_station680_8ft soilmoisture_station680_20ft soilmoisture_station802_2ft soilmoisture_station802_8ft soilmoisture_station802_20ft streamflow
date
2008-03-12 82.0 60.0 28.6 23.9 29.1 22.9 0.1 0.0 19.4 30.2 ... 10.2 24.2 25.8 14.6 11.7 26.0 4.0 1.3 6.5 2360.0
2008-03-15 86.0 63.0 29.2 24.5 29.7 23.4 0.1 0.2 11.7 23.4 ... 10.3 24.4 25.4 14.7 11.7 26.2 4.0 1.4 6.9 2260.0
2008-03-17 84.0 63.0 29.3 24.8 29.8 23.6 0.0 0.2 14.4 24.3 ... 10.3 24.7 25.2 14.4 12.1 26.1 4.1 1.4 6.7 2260.0
2008-03-18 83.0 62.0 29.3 24.8 29.8 23.6 0.0 0.0 11.8 23.9 ... 10.6 24.9 25.3 14.7 12.1 26.0 4.0 1.4 6.6 2260.0
2008-03-19 82.0 61.0 29.4 24.9 29.8 23.6 0.0 0.0 20.8 30.2 ... 10.6 25.0 25.1 14.6 11.7 25.9 4.0 1.6 6.9 2200.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-07-23 0.0 0.0 0.0 0.0 27.7 20.7 0.2 0.2 52.9 62.1 ... 13.8 14.5 11.5 35.5 14.2 30.2 24.1 5.5 9.2 1170.0
2021-07-24 0.0 0.0 0.0 0.0 27.7 21.1 0.0 0.4 54.7 57.0 ... 13.4 14.1 11.3 33.7 13.9 30.2 24.0 5.3 8.7 1240.0
2021-07-25 0.0 0.0 0.0 0.0 27.8 21.5 0.1 0.4 52.7 57.6 ... 13.1 13.9 10.8 39.8 13.8 30.2 23.8 5.0 8.1 1190.0
2021-07-26 0.0 0.0 0.0 0.0 27.8 21.5 0.0 0.0 57.0 61.2 ... 12.9 13.9 10.4 37.7 14.1 30.2 23.7 4.8 7.0 1170.0
2021-07-27 0.0 0.0 0.0 0.0 27.8 21.5 0.0 0.0 58.5 64.8 ... 12.4 13.6 10.1 34.0 13.9 30.1 23.9 4.4 6.1 1110.0

2996 rows × 30 columns

In [6]:
df_joined.to_parquet('data/streamflow_prediction_dataset.parquet')