Source code for olm.USGS.DataRetrieval

#Function to download USGS Daily Average Discharge values given date and the location text
"""
Contains functions used to download data from USGS databases.
"""

from lxml import etree
import lxml.html, requests
try:
    from urllib.parse import quote #could eventually rework to use only requests
except ImportError:
    from urllib import quote
from io import StringIO
from pandas import read_csv, DataFrame, to_datetime
#import requests
#import os


def GetSiteData(location):
    """
    Retrieves metadata about a Water Quality Portal site using the site identifier.

    Parameters
    ----------
    location : string
        Full site number.

    Returns
    -------
    siteDF : pandas.Series
        Returns a pandas Series object that contains all of the site metadata
        as key-data pairs (the single row returned by the station query).
    """
    BASEURL = 'https://www.waterqualitydata.us/data/Station/search?siteid='
    queryURL = BASEURL + location + '&mimeType=csv&Zip=no'
    siteDF = read_csv(queryURL, sep=',')
    siteDF = siteDF.iloc[0] #keep only the first row, so that we have key-data pairs
    return siteDF
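
#Illustrative usage sketch (not part of the original module). The site number
#is a placeholder, and the column name is an assumption about the Water
#Quality Portal station CSV; actual columns depend on the portal's output.
#
#   >>> meta = GetSiteData('USGS-07056000')
#   >>> meta['MonitoringLocationName']
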
def GetNWISSiteData(location):
    """
    Retrieves metadata about a USGS site using the full site identifier.

    Parameters
    ----------
    location : string
        Full USGS site number starting with 'USGS-' or the bare integer
        number of a USGS site.

    Returns
    -------
    siteDF : pandas.Series
        Returns a pandas Series object that contains all of the site metadata
        from an expanded USGS site data query. Data is indexed using the field
        labels given in the USGS file (see Notes).

    Notes
    -----
    Pairs of keys and descriptions from the USGS site metadata:

    agency_cd -- Agency
    site_no -- Site identification number
    station_nm -- Site name
    site_tp_cd -- Site type
    lat_va -- DMS latitude
    long_va -- DMS longitude
    dec_lat_va -- Decimal latitude
    dec_long_va -- Decimal longitude
    coord_meth_cd -- Latitude-longitude method
    coord_acy_cd -- Latitude-longitude accuracy
    coord_datum_cd -- Latitude-longitude datum
    dec_coord_datum_cd -- Decimal Latitude-longitude datum
    district_cd -- District code
    state_cd -- State code
    county_cd -- County code
    country_cd -- Country code
    land_net_ds -- Land net location description
    map_nm -- Name of location map
    map_scale_fc -- Scale of location map
    alt_va -- Altitude of Gage/land surface
    alt_meth_cd -- Method altitude determined
    alt_acy_va -- Altitude accuracy
    alt_datum_cd -- Altitude datum
    huc_cd -- Hydrologic unit code
    basin_cd -- Drainage basin code
    topo_cd -- Topographic setting code
    instruments_cd -- Flags for instruments at site
    construction_dt -- Date of first construction
    inventory_dt -- Date site established or inventoried
    drain_area_va -- Drainage area
    contrib_drain_area_va -- Contributing drainage area
    tz_cd -- Time Zone abbreviation
    local_time_fg -- Site honors Daylight Savings Time
    reliability_cd -- Data reliability code
    gw_file_cd -- Data-other GW files
    nat_aqfr_cd -- National aquifer code
    aqfr_cd -- Local aquifer code
    aqfr_type_cd -- Local aquifer type code
    well_depth_va -- Well depth
    hole_depth_va -- Hole depth
    depth_src_cd -- Source of depth data
    project_no -- Project number
    """
    if location[:5] == 'USGS-':
        sitenum = location[5:]
    else:
        sitenum = location
    BASEURL = 'https://waterservices.usgs.gov/nwis/site/?site='
    queryURL = BASEURL + sitenum + '&siteOutput=expanded'
    #Need to skip the comment header, which is hopefully uniform across USGS queries
    skiprows = list(range(0, 59))
    skiprows.append(60)
    siteDF = read_csv(queryURL, sep='\t', skiprows=skiprows)
    siteDF = siteDF.iloc[0] #keep only the first row, so that we have key-data pairs
    return siteDF
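
#Illustrative usage sketch (not part of the original module); the site number
#is a placeholder. Field labels are those listed in the Notes above.
#
#   >>> site = GetNWISSiteData('USGS-07056000')
#   >>> site['station_nm'], site['dec_lat_va'], site['dec_long_va']
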
def querySiteList(siteList, charList):
    """
    Constructs a Water Quality Portal Result query URL (XML mime type) for a
    list of site identifiers and a list of characteristic names.
    """
    BASE_URL = 'https://www.waterqualitydata.us/Result/search?'
    queryText = BASE_URL + 'siteid='
    #add sites to query
    for site in siteList:
        #check for USGS prefixes (are there others? EPA?)
        ##if not(site.startswith('USGS-')):
        ##    site = 'USGS-' + site
        #add this site to list with trailing semi-colon
        queryText += site + ';'
    #remove final semi-colon
    queryText = queryText[:-1]
    #add characteristics to query
    queryText += '&characteristicName='
    for characteristic in charList:
        queryText += characteristic + ';'
    #remove trailing semi-colon
    queryText = queryText[:-1]
    #add mime type
    queryText += '&mimeType=xml'
    #convert query string to url special characters
    queryText = quote(queryText, safe="/&=:?")
    return queryText
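
#Illustrative usage sketch (not part of the original module). The site numbers
#and characteristic names below are placeholders; valid values depend on the
#Water Quality Portal.
#
#   >>> url = querySiteList(['USGS-07056000', 'USGS-07055875'],
#   ...                     ['Calcium', 'Magnesium'])
#   >>> r = requests.get(url)  #response body is XML, per the mimeType set above
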
def GetDailyDischarge(location, date):
    """
    Retrieve daily average discharge value from USGS database for given date
    and USGS site.

    Parameters
    ----------
    location : string
        Full USGS site number starting with 'USGS-' or a string that just
        contains the bare integer number of a USGS site.
    date : string
        String containing the date for which discharge will be retrieved.
        Should be given as YYYY-MM-DD.

    Returns
    -------
    data : dict {'discharge':float, 'quality':string, 'name':string}
        Returns a dictionary that contains three items: the average discharge
        value for the site and date given, the quality code assigned to that
        discharge value, and the name of the site.

    Notes
    -----
    Currently hard-wired to retrieve USGS pcode 00060, daily discharge in cfs.
    """
    #construct url for discharge query
    BASE_URL = 'https://waterservices.usgs.gov/nwis/dv?format=waterml,1.1'
    #pull site number out of location text
    #Check to see if location contains 'USGS-' or is just the bare number
    if location[:5] == 'USGS-':
        site_number = location[5:]
    else:
        site_number = location
    #construct url for query
    query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + date + '&endDT=' + date
    #read in xml returned by the query
    try:
        print("Discharge query html: ", query_html)
        r = requests.get(query_html)
        root = etree.fromstring(r.content)
    except IOError:
        print("Problem retrieving discharge value (IOError).")
        return -1
    #parse xml to pull out discharge and quality code
    #get namespace map
    NSMAP = root.nsmap
    NS1 = "{%s}" % NSMAP['ns1']
    tsString = "timeSeries[@name='USGS:" + site_number + ":00060:00003']"
    ts = root.find(NS1 + tsString)
    if ts is None:
        #there is no time series data for this site and date
        return None
    sourceInfo = ts.find(NS1 + "sourceInfo")
    name = sourceInfo.findtext(NS1 + "siteName")
    values = ts.find(NS1 + "values")
    value = values.find(NS1 + "value")
    if value is None:
        #there is no discharge data for this site and date
        return None
    q = float(value.text) #convert to float to match the documented return type
    quality_code = value.get("qualifiers")
    #return discharge and quality code
    data = {'discharge': q, 'quality': quality_code, 'name': name}
    return data
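
#Illustrative usage sketch (not part of the original module); the site number
#and date are placeholders. The function returns None when no data are found
#and -1 on a retrieval error, so both cases are checked here.
#
#   >>> result = GetDailyDischarge('USGS-07056000', '2015-06-01')
#   >>> if result is not None and result != -1:
#   ...     print(result['name'], result['discharge'], result['quality'])
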
def GetDailyDischargeRecord(location, start_date, end_date=None):
    """
    Retrieve daily average discharge values from USGS database for given date
    range and USGS site.

    Parameters
    ----------
    location : str
        Full USGS site number starting with 'USGS-' or a string that just
        contains the bare integer number of a USGS site.
    start_date : str
        String containing the beginning date in the range for which discharge
        will be retrieved. Should be given as YYYY-MM-DD.
    end_date : str (optional)
        String containing the ending date in the range for which discharge
        will be retrieved. Should be given as YYYY-MM-DD. If not provided
        then data will be retrieved up to the current date.

    Returns
    -------
    data : pandas.DataFrame
        Returns a pandas DataFrame indexed by date, with a column 'discharge'
        of discharge values and a column 'quality' of the USGS quality rating.

    Notes
    -----
    Currently hard-wired to retrieve USGS pcode 00060, daily discharge in cfs.
    """
    #construct url for discharge query
    BASE_URL = 'https://waterservices.usgs.gov/nwis/dv?format=waterml,1.1'
    #pull site number out of location text
    #Check to see if location contains 'USGS-' or is just the bare number
    if location[:5] == 'USGS-':
        site_number = location[5:]
    else:
        site_number = location
    #construct url for query
    if end_date is None:
        query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + start_date
    else:
        query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + start_date + '&endDT=' + end_date
    #read in xml returned by the query
    try:
        r = requests.get(query_html)
        root = etree.fromstring(r.content)
    except IOError:
        print("Problem retrieving discharge value (IOError).")
        return -1
    #parse xml to pull out discharges and quality codes
    #get namespace map
    NSMAP = root.nsmap
    NS1 = "{%s}" % NSMAP['ns1']
    tsString = "timeSeries[@name='USGS:" + site_number + ":00060:00003']"
    ts = root.find(NS1 + tsString)
    if ts is None:
        #there is no time series data for this site and date range
        return None
    sourceInfo = ts.find(NS1 + "sourceInfo")
    name = sourceInfo.findtext(NS1 + "siteName")
    values = ts.find(NS1 + "values")
    value_list = values.findall(NS1 + "value")
    if len(value_list) == 0:
        #there is no discharge data for this site and date range
        return None
    q = []
    quality_code = []
    date_list = []
    for value in value_list:
        q.append(float(value.text))
        quality_code.append(value.get("qualifiers"))
        date_list.append(value.get("dateTime")[:10])
    #assemble discharges and quality codes into a date-indexed DataFrame
    data = DataFrame({'discharge': q, 'quality': quality_code},
                     index=to_datetime(date_list))
    #return discharge and quality code data frame
    return data
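
#Illustrative usage sketch (not part of the original module); the site number
#and dates are placeholders.
#
#   >>> df = GetDailyDischargeRecord('USGS-07056000', '2015-01-01',
#   ...                              end_date='2015-12-31')
#   >>> df['discharge'].mean()  #mean daily discharge (cfs) over the record
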