In [2]:
import requests
import pandas as pd
from io import StringIO
import html
import re
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
def usgs_water_api_pull(
        startDT="2000-01-01", 
        endDT="2024-09-06", 
        stateCd="CO", 
        siteStatus="all", 
        altMin=0):
    """API reference: https://waterservices.usgs.gov/test-tools/"""
    url = "https://waterservices.usgs.gov/nwis/dv/"
    params = {
        "format": "json",
        "stateCd": stateCd,
        "startDT": startDT,
        "endDT": endDT,
        "siteStatus": siteStatus,
        "altMin": altMin
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    print(response.url)
    data = response.json()

    # Flatten the timeSeries object without expanding the list elements
    df_flat = pd.json_normalize(
        data['value']['timeSeries'],
        sep='_',
        errors='ignore'
    )

    # Expand the 'values.value' field manually (since it's a list of dicts)
    expanded_values = []
    for i, row in df_flat.iterrows():
        for value_entry in row['values']:
            for val in value_entry['value']:
                expanded_values.append({
                    'siteName': row['sourceInfo_siteName'],
                    'siteCode': row['sourceInfo_siteCode'][0]['value'],
                    'variableName': row['variable_variableName'],
                    'unitCode': row['variable_unit_unitCode'],
                    'dateTime': val['dateTime'],
                    'value': val['value']
                })
    df = pd.DataFrame(expanded_values)

    def clean_html(text):
        text = html.unescape(text)
        text = re.sub(r'<.*?>', '', text)
        return text
    
    df = df.map(lambda x: clean_html(x) if isinstance(x, str) else x)

    return df

df = usgs_water_api_pull()
https://waterservices.usgs.gov/nwis/dv/?format=json&stateCd=CO&startDT=2000-01-01&endDT=2024-09-06&siteStatus=all&altMin=0
In [6]:
# Cleaning
df_streamflow = df[df['variableName'] == 'Streamflow, ft³/s']
df_streamflow = df_streamflow[['siteName', 'siteCode', 'dateTime', 'value']]
df_streamflow = df_streamflow.rename(columns={'value': 'streamflow'})

df_precipitation = df[df['variableName'] == 'Precipitation, total, in']
df_precipitation = df_precipitation[['siteName', 'siteCode', 'dateTime', 'value']]
df_precipitation = df_precipitation.rename(columns={'value': 'precipitation'})

df_joined = df_streamflow.merge(
    df_precipitation, on=['siteName', 'siteCode', 'dateTime'], how='left')

df_joined = df_joined.fillna(0.0).convert_dtypes()
df_joined['dateTime'] = pd.to_datetime(df_joined['dateTime'])
df_joined['streamflow'] = df_joined['streamflow'].astype(float)
df_joined['precipitation'] = df_joined['precipitation'].astype(float)
df_joined = df_joined[df_joined['streamflow'] >= 0.0]
df_joined = df_joined[df_joined['precipitation'] >= 0.0]

df_final = df_joined.copy()
del df_joined
display(df_final.info())
display(df_final.isnull().sum())
display(df_final)

df_final.to_parquet('data/usgs_water_data.parquet', index=False)
<class 'pandas.core.frame.DataFrame'>
Index: 2619518 entries, 0 to 2623056
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   siteName       string        
 1   siteCode       string        
 2   dateTime       datetime64[ns]
 3   streamflow     float64       
 4   precipitation  float64       
dtypes: datetime64[ns](1), float64(2), string(2)
memory usage: 119.9 MB
None
siteName         0
siteCode         0
dateTime         0
streamflow       0
precipitation    0
dtype: int64
siteName siteCode dateTime streamflow precipitation
0 HELL CREEK ABV NF NORTH PLATTE RIVER NR WALDEN... 06613950 2014-08-21 11.8 0.0
1 HELL CREEK ABV NF NORTH PLATTE RIVER NR WALDEN... 06613950 2014-08-22 11.4 0.0
2 HELL CREEK ABV NF NORTH PLATTE RIVER NR WALDEN... 06613950 2014-08-23 13.1 0.0
3 HELL CREEK ABV NF NORTH PLATTE RIVER NR WALDEN... 06613950 2014-08-24 11.4 0.0
4 HELL CREEK ABV NF NORTH PLATTE RIVER NR WALDEN... 06613950 2014-08-25 12.4 0.0
... ... ... ... ... ...
2623052 BIG THOMPSON BL MORAINE PARK NR ESTES PARK, CO 402114105350101 2024-09-02 19.6 0.0
2623053 BIG THOMPSON BL MORAINE PARK NR ESTES PARK, CO 402114105350101 2024-09-03 18.7 0.0
2623054 BIG THOMPSON BL MORAINE PARK NR ESTES PARK, CO 402114105350101 2024-09-04 19.6 0.0
2623055 BIG THOMPSON BL MORAINE PARK NR ESTES PARK, CO 402114105350101 2024-09-05 23.8 0.0
2623056 BIG THOMPSON BL MORAINE PARK NR ESTES PARK, CO 402114105350101 2024-09-06 20.6 0.0

2619518 rows × 5 columns

In [3]:
df = pd.read_parquet('data/usgs_water_data.parquet')
print(f"Rows: {df.shape[0]:,}")
Rows: 2,619,518
In [33]:
df_filtered = df[df['siteName'].str.contains('COLORADO RIVER BELOW GLENWOOD SPRINGS, CO')].reset_index(drop=True)
display(df_filtered)
siteName siteCode dateTime streamflow precipitation
0 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2000-01-01 1420.0 0.0
1 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2000-01-02 1520.0 0.0
2 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2000-01-03 1480.0 0.0
3 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2000-01-04 1230.0 0.0
4 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2000-01-05 1420.0 0.0
... ... ... ... ... ...
9011 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2024-09-02 1890.0 0.0
9012 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2024-09-03 1880.0 0.0
9013 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2024-09-04 1840.0 0.0
9014 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2024-09-05 1940.0 0.0
9015 COLORADO RIVER BELOW GLENWOOD SPRINGS, CO 09085100 2024-09-06 2130.0 0.0

9016 rows × 5 columns

In [6]:
# Visualize Streamflow
sns.lineplot(data=df_filtered, x='dateTime', y='streamflow', hue='siteName', legend=False)
plt.title('Streamflow of Colorado River below Glenwood Springs, CO')
plt.xlabel('Date')
plt.ylabel('Streamflow (ft³/s)')
plt.show()
No description has been provided for this image
In [7]:
# Visualize Precipitation
sns.lineplot(data=df_filtered, x='dateTime', y='precipitation', hue='siteName', legend=False)
plt.title('Precipitation of Colorado River Site below Glenwood Springs, CO')
plt.xlabel('Date')
plt.ylabel('Precipitation (in)')
plt.show()
No description has been provided for this image

Will drop precipitation based on lack of data.

In [6]:
# Drop columns
df_filtered = df[df['siteName'].str.contains('COLORADO RIVER BELOW GLENWOOD SPRINGS, CO')].reset_index(drop=True)
df_filtered['date'] = pd.to_datetime(df['dateTime'])
df_filtered = df_filtered.drop(columns=['precipitation', 'dateTime', 'siteName', 'siteCode'])
df_filtered = df_filtered.dropna().drop_duplicates()
df_filtered = df_filtered.set_index('date')
print(f"Rows: {df_filtered.shape[0]:,}")
print(f"Unique Dates: {df_filtered.index.nunique():,}")
df_grouped = df_filtered.groupby(by='date').mean()
display(df_grouped)

df_grouped.to_parquet('data/usgs_colorado_river_glenwood_springs.parquet')
Rows: 9,016
Unique Dates: 8,792
streamflow
date
2000-01-01 2510.0
2000-01-02 2570.0
2000-01-03 2580.0
2000-01-04 2540.0
2000-01-05 2500.0
... ...
2024-01-22 1890.0
2024-01-23 1880.0
2024-01-24 1840.0
2024-01-25 1940.0
2024-01-26 2130.0

8792 rows × 1 columns