Source code for olm.USGS.DataRetrieval

#Function to download USGS Daily Average Discharge values given date and the location text
"""
Contains functions used to download data from USGS databases.
"""

from lxml import etree
import lxml.html, requests
try:
    from urllib.parse import quote #could eventually rework to use only requests
except ImportError:
    from urllib import quote
from io import StringIO
from pandas import read_csv, DataFrame, to_datetime
#import requests
#import os


def GetSiteData(location):
    """
    Retrieves metadata about a Water Quality Portal site using the site identifier.

    Parameters
    ----------
    location : string
        Full site number.

    Returns
    -------
    siteDF : pandas.Series
        Returns a pandas Series object that contains all of the site metadata
        as key-data pairs (the single row returned by the station query).
    """
    BASEURL = 'https://www.waterqualitydata.us/data/Station/search?siteid='
    queryURL = BASEURL + location + '&mimeType=csv&Zip=no'
    siteDF = read_csv(queryURL, sep=',')
    siteDF = siteDF.iloc[0] #keep only the first row, so that we have key-data pairs
    return siteDF
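
#Illustrative usage sketch (not part of the original module). The site number
#is a placeholder, and the column name is an assumption about the Water
#Quality Portal station CSV; actual columns depend on the portal's output.
#
#   >>> meta = GetSiteData('USGS-07056000')
#   >>> meta['MonitoringLocationName']
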
def GetNWISSiteData(location):
    """
    Retrieves metadata about a USGS site using the full site identifier.

    Parameters
    ----------
    location : string
        Full USGS site number starting with 'USGS-' or the bare integer
        number of a USGS site.

    Returns
    -------
    siteDF : pandas.Series
        Returns a pandas Series object that contains all of the site metadata
        from an expanded USGS site data query. Data is indexed using the field
        labels given in the USGS file (see Notes).

    Notes
    -----
    Pairs of keys and descriptions from the USGS site metadata:

    agency_cd -- Agency
    site_no -- Site identification number
    station_nm -- Site name
    site_tp_cd -- Site type
    lat_va -- DMS latitude
    long_va -- DMS longitude
    dec_lat_va -- Decimal latitude
    dec_long_va -- Decimal longitude
    coord_meth_cd -- Latitude-longitude method
    coord_acy_cd -- Latitude-longitude accuracy
    coord_datum_cd -- Latitude-longitude datum
    dec_coord_datum_cd -- Decimal Latitude-longitude datum
    district_cd -- District code
    state_cd -- State code
    county_cd -- County code
    country_cd -- Country code
    land_net_ds -- Land net location description
    map_nm -- Name of location map
    map_scale_fc -- Scale of location map
    alt_va -- Altitude of Gage/land surface
    alt_meth_cd -- Method altitude determined
    alt_acy_va -- Altitude accuracy
    alt_datum_cd -- Altitude datum
    huc_cd -- Hydrologic unit code
    basin_cd -- Drainage basin code
    topo_cd -- Topographic setting code
    instruments_cd -- Flags for instruments at site
    construction_dt -- Date of first construction
    inventory_dt -- Date site established or inventoried
    drain_area_va -- Drainage area
    contrib_drain_area_va -- Contributing drainage area
    tz_cd -- Time Zone abbreviation
    local_time_fg -- Site honors Daylight Savings Time
    reliability_cd -- Data reliability code
    gw_file_cd -- Data-other GW files
    nat_aqfr_cd -- National aquifer code
    aqfr_cd -- Local aquifer code
    aqfr_type_cd -- Local aquifer type code
    well_depth_va -- Well depth
    hole_depth_va -- Hole depth
    depth_src_cd -- Source of depth data
    project_no -- Project number
    """
    if location[:5] == 'USGS-':
        sitenum = location[5:]
    else:
        sitenum = location
    BASEURL = 'https://waterservices.usgs.gov/nwis/site/?site='
    queryURL = BASEURL + sitenum + '&siteOutput=expanded'
    #Need to skip the comment header, which is hopefully uniform across USGS queries
    skiprows = list(range(0, 59))
    skiprows.append(60)
    siteDF = read_csv(queryURL, sep='\t', skiprows=skiprows)
    siteDF = siteDF.iloc[0] #keep only the first row, so that we have key-data pairs
    return siteDF
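
#Illustrative usage sketch (not part of the original module); the site number
#is a placeholder. Field labels are those listed in the Notes above.
#
#   >>> site = GetNWISSiteData('USGS-07056000')
#   >>> site['station_nm'], site['dec_lat_va'], site['dec_long_va']
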
def querySiteList(siteList, charList):
    """
    Constructs a Water Quality Portal Result query URL (XML mime type) for a
    list of site identifiers and a list of characteristic names.
    """
    BASE_URL = 'https://www.waterqualitydata.us/Result/search?'
    queryText = BASE_URL + 'siteid='
    #add sites to query
    for site in siteList:
        #check for USGS prefixes (are there others? EPA?)
        ##if not(site.startswith('USGS-')):
        ##    site = 'USGS-' + site
        #add this site to list with trailing semi-colon
        queryText += site + ';'
    #remove final semi-colon
    queryText = queryText[:-1]
    #add characteristics to query
    queryText += '&characteristicName='
    for characteristic in charList:
        queryText += characteristic + ';'
    #remove trailing semi-colon
    queryText = queryText[:-1]
    #add mime type
    queryText += '&mimeType=xml'
    #convert query string to url special characters
    queryText = quote(queryText, safe="/&=:?")
    return queryText
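
#Illustrative usage sketch (not part of the original module). The site numbers
#and characteristic names below are placeholders; valid values depend on the
#Water Quality Portal.
#
#   >>> url = querySiteList(['USGS-07056000', 'USGS-07055875'],
#   ...                     ['Calcium', 'Magnesium'])
#   >>> r = requests.get(url)  #response body is XML, per the mimeType set above
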
def GetDailyDischarge(location, date):
    """
    Retrieve daily average discharge value from USGS database for given date
    and USGS site.

    Parameters
    ----------
    location : string
        Full USGS site number starting with 'USGS-' or a string that just
        contains the bare integer number of a USGS site.
    date : string
        String containing the date for which discharge will be retrieved.
        Should be given as YYYY-MM-DD.

    Returns
    -------
    data : dict {'discharge':float, 'quality':string, 'name':string}
        Returns a dictionary that contains three items: the average discharge
        value for the site and date given, the quality code assigned to that
        discharge value, and the name of the site.

    Notes
    -----
    Currently hard-wired to retrieve USGS pcode 00060, daily discharge in cfs.
    """
    #construct url for discharge query
    BASE_URL = 'https://waterservices.usgs.gov/nwis/dv?format=waterml,1.1'
    #pull site number out of location text
    #Check to see if location contains 'USGS-' or is just the bare number
    if location[:5] == 'USGS-':
        site_number = location[5:]
    else:
        site_number = location
    #construct url for query
    query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + date + '&endDT=' + date
    #read in xml returned by the query
    try:
        print("Discharge query html: ", query_html)
        r = requests.get(query_html)
        root = etree.fromstring(r.content)
    except IOError:
        print("Problem retrieving discharge value (IOError).")
        return -1
    #parse xml to pull out discharge and quality code
    #get namespace map
    NSMAP = root.nsmap
    NS1 = "{%s}" % NSMAP['ns1']
    tsString = "timeSeries[@name='USGS:" + site_number + ":00060:00003']"
    ts = root.find(NS1 + tsString)
    if ts is None:
        #there is no time series data for this site and date
        return None
    sourceInfo = ts.find(NS1 + "sourceInfo")
    name = sourceInfo.findtext(NS1 + "siteName")
    values = ts.find(NS1 + "values")
    value = values.find(NS1 + "value")
    if value is None:
        #there is no discharge data for this site and date
        return None
    q = float(value.text) #convert to float to match the documented return type
    quality_code = value.get("qualifiers")
    #return discharge and quality code
    data = {'discharge': q, 'quality': quality_code, 'name': name}
    return data
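
#Illustrative usage sketch (not part of the original module); the site number
#and date are placeholders. The function returns None when no data are found
#and -1 on a retrieval error, so both cases are checked here.
#
#   >>> result = GetDailyDischarge('USGS-07056000', '2015-06-01')
#   >>> if result is not None and result != -1:
#   ...     print(result['name'], result['discharge'], result['quality'])
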
def GetDailyDischargeRecord(location, start_date, end_date=None):
    """
    Retrieve daily average discharge values from USGS database for given date
    range and USGS site.

    Parameters
    ----------
    location : str
        Full USGS site number starting with 'USGS-' or a string that just
        contains the bare integer number of a USGS site.
    start_date : str
        String containing the beginning date in the range for which discharge
        will be retrieved. Should be given as YYYY-MM-DD.
    end_date : str (optional)
        String containing the ending date in the range for which discharge
        will be retrieved. Should be given as YYYY-MM-DD. If not provided
        then data will be retrieved up to the current date.

    Returns
    -------
    data : pandas.DataFrame
        Returns a pandas DataFrame indexed by date, with a column 'discharge'
        of discharge values and a column 'quality' of the USGS quality rating.

    Notes
    -----
    Currently hard-wired to retrieve USGS pcode 00060, daily discharge in cfs.
    """
    #construct url for discharge query
    BASE_URL = 'https://waterservices.usgs.gov/nwis/dv?format=waterml,1.1'
    #pull site number out of location text
    #Check to see if location contains 'USGS-' or is just the bare number
    if location[:5] == 'USGS-':
        site_number = location[5:]
    else:
        site_number = location
    #construct url for query
    if end_date is None:
        query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + start_date
    else:
        query_html = BASE_URL + '&sites=' + site_number + '&startDT=' + start_date + '&endDT=' + end_date
    #read in xml returned by the query
    try:
        r = requests.get(query_html)
        root = etree.fromstring(r.content)
    except IOError:
        print("Problem retrieving discharge value (IOError).")
        return -1
    #parse xml to pull out discharges and quality codes
    #get namespace map
    NSMAP = root.nsmap
    NS1 = "{%s}" % NSMAP['ns1']
    tsString = "timeSeries[@name='USGS:" + site_number + ":00060:00003']"
    ts = root.find(NS1 + tsString)
    if ts is None:
        #there is no time series data for this site and date range
        return None
    sourceInfo = ts.find(NS1 + "sourceInfo")
    name = sourceInfo.findtext(NS1 + "siteName")
    values = ts.find(NS1 + "values")
    value_list = values.findall(NS1 + "value")
    if len(value_list) == 0:
        #there is no discharge data for this site and date range
        return None
    q = []
    quality_code = []
    date_list = []
    for value in value_list:
        q.append(float(value.text))
        quality_code.append(value.get("qualifiers"))
        date_list.append(value.get("dateTime")[:10])
    #assemble discharges and quality codes into a date-indexed DataFrame
    data = DataFrame({'discharge': q, 'quality': quality_code},
                     index=to_datetime(date_list))
    #return discharge and quality code data frame
    return data
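
#Illustrative usage sketch (not part of the original module); the site number
#and dates are placeholders.
#
#   >>> df = GetDailyDischargeRecord('USGS-07056000', '2015-01-01',
#   ...                              end_date='2015-12-31')
#   >>> df['discharge'].mean()  #mean daily discharge (cfs) over the record
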